提交 5447046a 编写于 作者: D dzhwinter

merge develop branch

...@@ -36,6 +36,7 @@ include(simd) ...@@ -36,6 +36,7 @@ include(simd)
################################ Configurations ####################################### ################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
...@@ -180,6 +181,11 @@ if(WITH_GPU) ...@@ -180,6 +181,11 @@ if(WITH_GPU)
include(cuda) include(cuda)
endif(WITH_GPU) endif(WITH_GPU)
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
endif(WITH_AMD_GPU)
if(WITH_MKLML) if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif() endif()
......
...@@ -18,12 +18,13 @@ import sys ...@@ -18,12 +18,13 @@ import sys
import time import time
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.fluid as fluid
import paddle.v2.fluid.core as core import paddle.fluid.core as core
import paddle.v2.fluid.profiler as profiler import paddle.fluid.profiler as profiler
import argparse import argparse
import functools import functools
import os import os
from paddle.fluid import debuger
def str2bool(v): def str2bool(v):
...@@ -182,28 +183,27 @@ def main(): ...@@ -182,28 +183,27 @@ def main():
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
train_pass_acc.reset() train_pass_acc.reset()
with profiler.profiler("CPU", 'total') as prof: for batch_id, data in enumerate(train_reader()):
for batch_id, data in enumerate(train_reader()): ts = time.time()
ts = time.time() img_data = np.array(
img_data = np.array( map(lambda x: x[0].reshape(data_shape), data)).astype(
map(lambda x: x[0].reshape(data_shape), data)).astype( "float32")
"float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([-1, 1])
y_data = y_data.reshape([-1, 1])
loss, acc, b_size = exe.run(
loss, acc, b_size = exe.run( trainer_prog,
trainer_prog, feed={"pixel": img_data,
feed={"pixel": img_data, "label": y_data},
"label": y_data}, fetch_list=[avg_cost, batch_acc, batch_size])
fetch_list=[avg_cost, batch_acc, batch_size]) iters += 1
iters += 1 num_samples += len(data)
num_samples += len(data) train_pass_acc.add(value=acc, weight=b_size)
train_pass_acc.add(value=acc, weight=b_size) print(
print( "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" % (pass_id, iters, loss, acc,
% (pass_id, iters, loss, acc, len(data) / (time.time() - ts))
len(data) / (time.time() - ts)) ) # The accuracy is the accumulation of batches, but not the current batch.
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time pass_elapsed = time.time() - start_time
pass_train_acc = train_pass_acc.eval() pass_train_acc = train_pass_acc.eval()
...@@ -254,9 +254,7 @@ def main(): ...@@ -254,9 +254,7 @@ def main():
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog) pserver_prog)
print("starting server side startup")
exe.run(pserver_startup) exe.run(pserver_startup)
print("starting parameter server...")
exe.run(pserver_prog) exe.run(pserver_prog)
elif training_role == "TRAINER": elif training_role == "TRAINER":
# Parameter initialization # Parameter initialization
......
...@@ -292,14 +292,18 @@ def run_benchmark(cluster_spec, server): ...@@ -292,14 +292,18 @@ def run_benchmark(cluster_spec, server):
return np.mean(test_accs) return np.mean(test_accs)
config = tf.ConfigProto( config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1,
log_device_placement=True)
config.gpu_options.allow_growth = True config.gpu_options.allow_growth = True
hooks = [tf.train.StopAtStepHook(last_step=1000000)] hooks = [tf.train.StopAtStepHook(last_step=1000000)]
with tf.train.MonitoredTrainingSession( with tf.train.MonitoredTrainingSession(
master=server.target, is_chief=(args.task_index == 0), master=server.target,
hooks=hooks) as sess: is_chief=(args.task_index == 0),
hooks=hooks,
config=config) as sess:
iters, num_samples, start_time = 0, 0, 0.0 iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.num_passes): for pass_id in range(args.num_passes):
# train # train
......
...@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG) ...@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG) endif(NOT WITH_GOLANG)
if(NOT WITH_GPU) if(WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DPADDLE_WITH_CUDA)
FIND_PACKAGE(CUDA REQUIRED) FIND_PACKAGE(CUDA REQUIRED)
...@@ -84,7 +80,14 @@ else() ...@@ -84,7 +80,14 @@ else()
# Include cuda and cudnn # Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE}) include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU) elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
else()
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
endif()
if (WITH_MKLML AND MKLML_IOMP_LIB) if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
......
...@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost") ...@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here. # So we use 1.41.0 here.
set(BOOST_VER "1.41.0") set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0") set(BOOST_TAR "boost_1_41_0")
set(BOOST_URL "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz") set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
......
...@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) ...@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
ExternalProject_Add( if(WITH_AMD_GPU)
extern_eigen3 ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS} extern_eigen3
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
PREFIX ${EIGEN_SOURCE_DIR} GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
UPDATE_COMMAND "" PREFIX ${EIGEN_SOURCE_DIR}
CONFIGURE_COMMAND "" UPDATE_COMMAND ""
BUILD_COMMAND "" CONFIGURE_COMMAND ""
INSTALL_COMMAND "" BUILD_COMMAND ""
TEST_COMMAND "" INSTALL_COMMAND ""
) TEST_COMMAND ""
)
else()
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
endif()
if (${CMAKE_VERSION} VERSION_LESS "3.3.0") if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
......
...@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") ...@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
...@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) ...@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(MKLML)\n" "PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n" "cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}\n" "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
" DESTINATION ${MKLML_DST_DIR})\n") " DESTINATION ${MKLML_DST_DIR})\n")
ExternalProject_Add( ExternalProject_Add(
......
...@@ -317,6 +317,82 @@ function(nv_test TARGET_NAME) ...@@ -317,6 +317,82 @@ function(nv_test TARGET_NAME)
endif() endif()
endfunction(nv_test) endfunction(nv_test)
function(hip_library TARGET_NAME)
if (WITH_AMD_GPU)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_library_SRCS})
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
endif()
if(hip_library_SRCS)
if (hip_library_SHARED OR hip_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
else()
add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
find_fluid_modules(${TARGET_NAME})
endif()
if (hip_library_DEPS)
add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${hip_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
else(hip_library_SRCS)
if (hip_library_DEPS)
merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
else()
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(hip_library_SRCS)
endif()
endfunction(hip_library)
function(hip_binary TARGET_NAME)
if (WITH_AMD_GPU)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
if(hip_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
endif()
endif()
endfunction(hip_binary)
function(hip_test TARGET_NAME)
if (WITH_AMD_GPU AND WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_test_SRCS})
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(hip_test)
function(go_library TARGET_NAME) function(go_library TARGET_NAME)
set(options STATIC static SHARED shared) set(options STATIC static SHARED shared)
set(oneValueArgs "") set(oneValueArgs "")
......
if(NOT WITH_AMD_GPU)
return()
endif()
include_directories("/opt/rocm/include")
include_directories("/opt/rocm/hipblas/include")
include_directories("/opt/rocm/hiprand/include")
include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
if(WITH_DSO)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)
if(WITH_DOUBLE)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
endif(WITH_DOUBLE)
if(WITH_TESTING)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "/opt/rocm/hcc")
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
...@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND) ...@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND)
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir} DSTS ${dst_dir} ${dst_dir}
) )
elseif (WITH_MKLML)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
)
endif() endif()
# paddle fluid module # paddle fluid module
......
add_subdirectory(v2) add_subdirectory(v2)
add_subdirectory(fluid)
if(NOT DEFINED SPHINX_THEME)
set(SPHINX_THEME default)
endif()
if(NOT DEFINED SPHINX_THEME_DIR)
set(SPHINX_THEME_DIR)
endif()
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_fluid_docs
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
"${BINARY_BUILD_DIR_CN}/conf.py"
@ONLY)
sphinx_add_target(paddle_fluid_docs_cn
html
${BINARY_BUILD_DIR_CN}
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
Build and Install
------------
# go_op Design
## Introduction
The **go_op** allows user's of PaddlePaddle to run program blocks on a detached
thread. It works in conjuction with CSP operators (channel_send,
channel_receive, channel_open, channel_close, and select) to allow users to
concurrently process data and communicate easily between different threads.
## How to use it
```
channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
# Send a tensor of value 99 to "channel" on a detached thread
tensor = fill_constant(shape=[1], dtype='int', value=99)
tensor.stop_gradient = True
fluid.channel_send(channel, tensor)
# Receive sent tensor from "channel" on the main thread
result = fill_constant(shape=[1], dtype='int', value=-1)
fluid.channel_recv(ch, result)
```
The go operator can be accessed by using the fluid.Go() control flow. This
will create a new sub block, where the user can add additional operators
to be ran on the thread.
**Note:** Since back propegation is currently not support in the go_op, users
should ensure that operators in the go block does not require gradient
calculations.
## How it Works
Similar to other control blocks, go_op will create a sub block and add it
as a child to the current block. Operators and variables defined in this
block will be added to the go sub_block.
In addition, the go operator will create a new child scope whose parent is
the global scope. Please refer to [block captures](#block-captures) for more
information.
When Paddle executor runs go_op, go_op will take the sub_block and pass it to
the executor.run method (along with a newly created local scope) on a detached
thread.
An example of the generated program description is shown below. Take note of
the **go_op** in particular. It is added as an operator in the current
block (in this example, block0). The **go_op** contains a `sub_block`
attribute, which points to the id of the block that will be executed in a
detached thread.
```
blocks {
idx: 0
parent_idx: -1
vars {
name: "return_value"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: INT64
}
}
}
}
vars {
name: "status_recv"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "channel"
}
type: "channel_create"
attrs {
name: "data_type"
type: INT
i: 7
}
attrs {
name: "capacity"
type: INT
i: 0
}
}
ops {
inputs {
parameter: "X"
arguments: "channel"
}
type: "go"
attrs {
name: "sub_block"
type: BLOCK
block_idx: 1
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
outputs {
parameter: "Out"
arguments: "return_value"
}
outputs {
parameter: "Status"
arguments: "status_recv"
}
type: "channel_recv"
}
...
}
blocks {
idx: 1
parent_idx: 0
vars {
name: "status"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "fill_constant_1.tmp_0"
}
type: "fill_constant"
attrs {
name: "force_cpu"
type: BOOLEAN
b: false
}
attrs {
name: "value"
type: FLOAT
f: 99.0
}
attrs {
name: "shape"
type: INTS
ints: 1
}
attrs {
name: "dtype"
type: INT
i: 3
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
inputs {
parameter: "X"
arguments: "fill_constant_1.tmp_0"
}
outputs {
parameter: "Status"
arguments: "status"
}
type: "channel_send"
attrs {
name: "copy"
type: BOOLEAN
b: false
}
}
```
## Current Limitations
#### <a name="block-captures"></a>Scopes and block captures:
Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
block. When a block is executed, a new local scope is created from the parent
scope (ie: scope derived from the parent block) and associated with the new
child block. After the block finishes executing, then the local scope and
all associated variables in the scope is deleted.
This works well in a single threaded scenario, however with introduction of
go_op, a child block may continue to execute even after the parent block has
exited. If the go_op tries to access variables located in the parent block's
scope, it may receive a segmentation fault because the parent scope may have
been deleted.
We need to implement block closures in order to prevent access to parent
scope variables from causing a segmentation fault. As a temporary workaround,
please ensure that all variables accessed in the go block is not destructed
before it is being accessed. Currently, the go_op will explicitly enforce
this requirement and raise an exception if a variable could not be found in
the scope.
Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
for more details.
#### Green Threads
Golang utilizes `green threads`, which is a mechnism for the runtime library to
manage multiple threads (instead of natively by the OS). Green threads usually
allows for faster thread creation and switching, as there is less overhead
when spawning these threads. For the first version of CSP, we only support
OS threads.
#### Backward Propegation:
go_op currently does not support backwards propagation. Please use go_op with
non training operators.
# select_op Design
## Introduction
In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
statement lets a goroutine wait on multiple communication operations at the
same time. The **select** blocks until one of its cases can run, then
executes the case. If multiple cases are ready to run, then one case is
choosen at random to be executed.
With the introduction of CSP for Paddle, we mimic this behavior by
creating a ***select_op***.
## How to use it
The **select_op** is available as a c++ operator. However most users
will prefer to use the much simplier Python API.
- **fluid.Select()**: Creates a select operator and adds it to the current
block within the main program. Also creates a sub block and adds it to the
main program. This sub block is used to hold all variables and operators
used by the case statements.
Within the select block, users can add cases by
calling **select.case** or **select.default** method.
- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
a fluid channel send/recv case. This method creates a SelectCase block
guard and adds it to the Select block. The arguments into this method tells
the select which channel operation to listen to.
- **fluid.Select.default()**: Represents the fluid default case. This default
case is executed if none of the channel send/recv cases are available to
execute.
**Example:**
```
ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
while_op = While(cond=while_cond)
with while_op.block():
with fluid.Select() as select:
with select.case(fluid.channel_send, channel, x):
# Send x, then perform Fibonacci calculation on x and y
x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
assign(input=x, output=x_tmp)
assign(input=y, output=x)
assign(elementwise_add(x=x_tmp, y=y), output=y)
with select.case(fluid.channel_recv, quit_channel, result2):
# Exit out of While loop
while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
helper = layer_helper.LayerHelper('assign')
helper.append_op(
type='assign',
inputs={'X': [while_false]},
outputs={'Out': [while_cond]})
```
## How it Works
### Program Description
```
blocks {
idx: 0
...
// Create "case_to_execute" variable
ops {
outputs {
parameter: "Out"
arguments: "fill_constant_110.tmp_0"
}
type: "fill_constant"
attrs {
name: "force_cpu"
type: BOOLEAN
b: false
}
attrs {
name: "value"
type: FLOAT
f: -1.0
}
attrs {
name: "shape"
type: INTS
ints: 1
}
attrs {
name: "dtype"
type: INT
i: 2
}
}
// Create "select" operator.
// inputs:
// X: All input variables used by operators within the select block
// case_to_execute: Variable filled in by select_op when it determines
// which case to execute.
//
// outputs:
// Out: All output variables referenced by operators within select block.
//
// attrs:
// sub_block: The block id containing the select "cases"
// cases: Serialized list of all cases in the select op.
// Each case is serialized as: '<index>,<type>,<channel>,<value>'
// where type is 0 for default, 1 for send, and 2 for receive.
// No channel and values are needed for default cases.
ops {
inputs {
parameter: "X"
arguments: "fill_constant_103.tmp_0"
arguments: "fill_constant_104.tmp_0"
}
inputs {
parameter: "case_to_execute"
arguments: "fill_constant_110.tmp_0"
}
outputs {
parameter: "Out"
arguments: "fill_constant_110.tmp_0"
}
type: "select"
attrs {
name: "sub_block"
type: BLOCK
block_idx: 1
}
attrs {
name: "cases"
type: STRINGS
strings: "0,1,channel_101,fill_constant_109.tmp_0"
strings: "1,2,channel_102,fill_constant_108.tmp_0"
}
}
...
}
```
The python select API will add the **select_op** to the current block. In addition, it will
iterate through all it's case statements and add any input variables required by case statements
into **X**. It will also create a temp variable called **case_to_execute**. This variable is
filled in by the select_op after it has completed processing the case statements.
If there are no available cases to execute (ie: all cases are blocked on channel operations, and
there is no default statement), then the select_op will block the current thread. The thread will
unblock once there is a channel operation affecting one of the case statements, at which point, the
**select_op** will set the **case_to_execute** variable to the index of the case to execute.
Finally the select_op will call executor.run on the **sub_block**.
```
blocks {
idx: 1
parent_idx: 0
...
// Fill a tensor with the case index (ie: 0,1,2,3,ect.)
ops {
outputs {
parameter: "Out"
arguments: "fill_constant_111.tmp_0"
}
type: "fill_constant"
attrs {
name: "force_cpu"
type: BOOLEAN
b: false
}
attrs {
name: "value"
type: FLOAT
f: 0.0
}
attrs {
name: "shape"
type: INTS
ints: 1
}
attrs {
name: "dtype"
type: INT
i: 2
}
}
// Create an "equal" operator to compare the case index with the "case_to_execute"
// tensor (which was filled in by the select op).
ops {
inputs {
parameter: "X"
arguments: "fill_constant_111.tmp_0" // case 0
}
inputs {
parameter: "Y"
arguments: "fill_constant_110.tmp_0" // case_to_execute
}
outputs {
parameter: "Out"
arguments: "equal_0.tmp_0"
}
type: "equal"
attrs {
name: "axis"
type: INT
i: -1
}
}
// Use the output of the "equal" operator as a condition for the "conditional_block".
// If the condition evaluates to true, then execute the "sub_block" (which represents
// the select case's body)
ops {
inputs {
parameter: "Params"
}
inputs {
parameter: "X"
arguments: "equal_0.tmp_0"
}
outputs {
parameter: "Out"
}
outputs {
parameter: "Scope"
arguments: "_generated_var_0"
}
type: "conditional_block"
attrs {
name: "is_scalar_condition"
type: BOOLEAN
b: true
}
attrs {
name: "sub_block"
type: BLOCK
block_idx: 4
}
}
...
// Repeat the above operators for each case statements inside the select body
}
```
Cases are represented by a **conditional_block operator**, whose's condition is set as the output of
equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
only one case will be executed.
### select_op flow
<p align="center">
<img src="./images/select_op_workflow.png"/><br/>
</p>
The select algorithm is inspired by golang's select routine. Please refer to
http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
## Backward Pass
TODO
...@@ -26,7 +26,7 @@ lookup of rows. ...@@ -26,7 +26,7 @@ lookup of rows.
The following figure illustrates the multiplication of x with two The following figure illustrates the multiplication of x with two
non-zero elements, or say, two symbols, and a lookup table W: non-zero elements, or say, two symbols, and a lookup table W:
![lookup table](./lookup_table.png) ![lookup table](./src/lookup_table.png)
### The Backward Algorithm ### The Backward Algorithm
...@@ -42,7 +42,7 @@ or some more sophisticated algorithms that rely on both W' and W: ...@@ -42,7 +42,7 @@ or some more sophisticated algorithms that rely on both W' and W:
$$W = f(W, W')$$ $$W = f(W, W')$$
The following figure illustrates the backward pass of the lookup The following figure illustrates the backward pass of the lookup
operator: ![lookup table training](./lookup_table_training.png) operator: ![lookup table training](./src/lookup_table_training.png)
## Distributed Storage Service ## Distributed Storage Service
......
...@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i ...@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
## RNN Algorithm Implementation ## RNN Algorithm Implementation
<p align="center"> <p align="center">
<img src="./images/rnn.jpg"/> <img src="./rnn.jpg"/>
</p> </p>
The above diagram shows an RNN unrolled into a full network. The above diagram shows an RNN unrolled into a full network.
...@@ -22,7 +22,7 @@ There are several important concepts here: ...@@ -22,7 +22,7 @@ There are several important concepts here:
There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
<p align="center"> <p align="center">
<img src="./images/rnn.png"/><br/> <img src="./rnn.png"/><br/>
Figure 2 illustrates the RNN's data flow Figure 2 illustrates the RNN's data flow
</p> </p>
...@@ -49,7 +49,7 @@ or copy the memory value of the previous step to the current ex-memory variable. ...@@ -49,7 +49,7 @@ or copy the memory value of the previous step to the current ex-memory variable.
### Usage in Python ### Usage in Python
For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
We can define an RNN's step-net using a Block: We can define an RNN's step-net using a Block:
...@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par ...@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
<p align="center"> <p align="center">
<img src="./images/2_level_rnn.png"/> <img src="./2_level_rnn.png"/>
</p> </p>
```python ```python
...@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st ...@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
<p align="center"> <p align="center">
<img src="images/rnn_2level_data.png"/> <img src="./rnn_2level_data.png"/>
</p> </p>
设计思想
------------
...@@ -103,7 +103,7 @@ In computability theory, a system of data-manipulation rules, such as a programm ...@@ -103,7 +103,7 @@ In computability theory, a system of data-manipulation rules, such as a programm
There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program. There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md). Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md).
......
开发标准
------------
Development
------------
This is Development page
新手入门
------------
新手入门
GET STARTED
------------
This is get started page
# Fluid 分布式版本使用指南
本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行,以及将单机训练脚本改造成支持集群训练的版本
## 准备工作
* 可用的集群
包含一个或多个计算节点的集群,每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址,集群内的所有计算节点可以通过网络相互通信。
* 安装PaddlePaddle Fluid with Distribution版本
所有的计算节点上均需要按照分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
**注意:**当前对外提供的PaddlePaddle版本并不支持分布式,需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
cmake编译命令中需要将WITH_DISTRIBUTE设置为ON,下面是一个cmake编译指令示例:
``` bash
cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
```
## 更新训练脚本
这里,我们以[Deep Learing 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例,描述如何将单机训练脚本改造成支持集群训练的版本。
### 单机训练脚本示例
```python
import paddle.v2 as paddle
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
BATCH_SIZE = 20
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good.
exit(1)
```
我们创建了一个简单的全连接神经网络程序,并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
### 介绍Parameter Server
在非分布式版本的训练脚本中,只存在Trainer一种角色,它不仅处理常规的计算任务,也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中,由于存在多个Trainer节点进行同样的数据计算任务,因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中,我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。**
### 分布式训练
Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将他们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
```python
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
```
将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下:
```python
... #define the program, cost, and create sgd optimizer
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
t = fluid.DistributeTranspiler() # create the transpiler instance
# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
... #create executor
# in pserver, run this
#current_endpoint here means current pserver IP:PORT you wish to run on
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
# in trainer, run this
... # define data reader
exe.run(fluid.default_startup_program())
for pass_id in range(100):
for data in train_reader():
exe.run(t.get_trainer_program())
```
### 分布式训练脚本运行说明
分布式任务的运行需要将表格中说明的多个参数进行赋值:
| 参数名 | 值类型 | 说明 | 示例 |
|:-------------|:------|:---------------------------------------|:-------------|
| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 | 0/1/2/3 |
| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
| trainers | int | 训练节点的总个数,>0的数字 | 4 |
| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
| training_role | str | 节点角色, TRAINER/PSERVER | PSERVER |
**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
```python
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops,
params_grads,
trainer_id,
pservers=pserver,
trainers=trainers)
if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(server_endpoint)
pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
```
### Demo
完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
第一步,进入demo代码所在目录:
```bash
cd /paddle/python/paddle/fluid/tests/book
```
第二步,启动Parameter Server:
```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
```
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
第三步,启动Trainer:
```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
```
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
进阶使用
------------
HOW TO
------------
This is how to page
PaddlePaddle Fluid
==========================
.. toctree::
:maxdepth: 1
getstarted/index_cn.rst
design/index_cn.rst
build_and_install/index_cn.rst
howto/index_cn.rst
dev/index_cn.rst
faq/index_cn.rst
PaddlePaddle Fluid
==========================
.. toctree::
:maxdepth: 1
getstarted/index_en.rst
design/index_en.rst
build_and_install/index_en.rst
howto/index_en.rst
dev/index_en.rst
faq/index_en.rst
...@@ -47,3 +47,10 @@ DecayedAdagrad ...@@ -47,3 +47,10 @@ DecayedAdagrad
:members: :members:
:noindex: :noindex:
Adadelta
--------------
.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
:members:
:noindex:
Development Development
------------ ------------
PaddlePaddle adheres to the following three sections of code and document specifications.
PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages,which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development.
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
contribute_to_paddle_en.md contribute_to_paddle_en.md
PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem description. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend to use the paddlepaddle.org tool to compile and generate and preview documents locally. Please refer to:
.. toctree::
:maxdepth: 1
write_docs_en.rst write_docs_en.rst
PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize layer, please refer to the following, and welcome to propose patch.
.. toctree::
:maxdepth: 1
new_layer_en.rst new_layer_en.rst
...@@ -2,13 +2,14 @@ ...@@ -2,13 +2,14 @@
如何贡献文档 如何贡献文档
############# #############
PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成,也可以利用paddlepaddle.org工具来编译和预览文档 PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的,PaddlePaddle.org工具可以帮助我们实现这一编译过程,并提供更好的预览效果
如何构建文档 如何构建文档
============ ============
PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。 PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
我们建议使用PaddlePaddle.org工具来构建文档。
使用PaddlePaddle.org工具 使用PaddlePaddle.org工具
------------------------ ------------------------
...@@ -31,7 +32,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D ...@@ -31,7 +32,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档
编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
...@@ -56,7 +57,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D ...@@ -56,7 +57,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
python manage.py runserver python manage.py runserver
工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。 想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
...@@ -96,7 +97,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D ...@@ -96,7 +97,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
python -m SimpleHTTPServer 8088 python -m SimpleHTTPServer 8088
在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
.. image:: src/doc_en.png .. image:: src/doc_en.png
:align: center :align: center
......
...@@ -2,21 +2,20 @@ ...@@ -2,21 +2,20 @@
Contribute Documentation Contribute Documentation
######################## ########################
PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
How to Build Documentations How to build Documentation
============ ===========================
We recommend using PaddlePaddle.org tool to build documentation PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways.
We recommend using PaddlePaddle.org tool to build documentation.
Use PaddlePaddle.org tool Using PaddlePaddle.org tool
-------------- -----------------------------
This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser. This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
.. code-block:: bash .. code-block:: bash
...@@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi ...@@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi
# Please specify the working directory through -v # Please specify the working directory through -v
docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run commands
Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
...@@ -58,19 +57,62 @@ If you don't wish to use Docker, you can also activate the tool through Django. ...@@ -58,19 +57,62 @@ If you don't wish to use Docker, you can also activate the tool through Django.
pip install -r requirements.txt pip install -r requirements.txt
python manage.py runserver python manage.py runserver
Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is.
Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation
The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。 Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ for more information about the PaddlePaddle.org tool.
Manually Building the Documentation
-------------------------------------
Build PaddlePaddle's documentation with Docker,you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
[TBD]
If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build documents, use the following commands
make -j $processors gen_proto_py
make -j $processors paddle_docs paddle_docs_cn
# If you only need to build APIs, use the following commands
make -j $processors gen_proto_py framework_py_proto
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs,it will generate``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
How to write Documentations .. image:: src/doc_en.png
============ :align: center
:scale: 60 %
PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail. How to write Documentation
===========================
PaddlePaddle uses `sphinx`_ to compile documentation,Please check sphinx official website for more detail.
How to update www.paddlepaddle.org How to update www.paddlepaddle.org
============================ ===================================
Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。 Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
......
FAQ FAQ
==== ====
This document provides answers to some of the frequently asked questions about PaddlePaddle. If you have a question that is not covered here, please go to `PaddlePaddle Community <https://github.com/PaddlePaddle/Paddle/issues>`_ , to find an answer or submit new `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_ , we will reply in time.
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
......
GET STARTED GET STARTED
============ ============
If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
quickstart_en.rst quickstart_en.rst
While using PaddlePaddle to build applications, please understand some basic concepts.
Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
.. toctree::
:maxdepth: 1
concepts/use_concepts_en.rst concepts/use_concepts_en.rst
C-API Prediction Library C-API Inference Library
======================== ========================
After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result.
Compared with model training, prediction has the following features:
#. Inference does not require backpropagation and parameter updates, as required during training.
#. Labels are not needed in prediction.
#. Most of the time, predictions need to be integrated with the user system.
Therefore, the model prediction SDK needs to be designed separately and has the following features:
#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK.
#. The predictive SDK needs a simple user interface for ease of use.
#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged.
#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface.
PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
......
...@@ -2,6 +2,9 @@ Distributed Training ...@@ -2,6 +2,9 @@ Distributed Training
==================== ====================
The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model. The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model.
Distributed training generally has framwork as shown below:
.. image:: src/ps_en.png .. image:: src/ps_en.png
:width: 500 :width: 500
......
HOW TO HOW TO
======= ========
PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
cmd_parameter/index_en.rst cmd_parameter/index_en.rst
PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
.. toctree::
:maxdepth: 1
cluster/index_en.rst cluster/index_en.rst
PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API:
.. toctree::
:maxdepth: 1
capi/index_en.rst capi/index_en.rst
PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:
.. toctree::
:maxdepth: 1
rnn/index_en.rst rnn/index_en.rst
How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:
.. toctree::
:maxdepth: 1
optimization/gpu_profiling_en.rst optimization/gpu_profiling_en.rst
...@@ -34,7 +34,7 @@ class Channel { ...@@ -34,7 +34,7 @@ class Channel {
public: public:
virtual bool CanSend() = 0; virtual bool CanSend() = 0;
virtual bool CanReceive() = 0; virtual bool CanReceive() = 0;
virtual bool Send(T*) = 0; virtual void Send(T*) = 0;
virtual bool Receive(T*) = 0; virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0; virtual size_t Cap() = 0;
virtual void Lock() = 0; virtual void Lock() = 0;
...@@ -84,69 +84,81 @@ class ChannelHolder { ...@@ -84,69 +84,81 @@ class ChannelHolder {
} }
template <typename T> template <typename T>
bool Send(T* data) { void Send(T* data) {
if (!IsInitialized()) return false; PADDLE_ENFORCE_EQ(IsInitialized(), true,
PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T))); "The Channel hasn't been initialized");
PADDLE_ENFORCE_EQ(
holder_->Type(), std::type_index(typeid(T)),
"Channel type is not same as the type of the data being sent");
// Static cast should be safe because we have ensured that types are same // Static cast should be safe because we have ensured that types are same
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
return channel != nullptr ? channel->Send(data) : false; PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
channel->Send(data);
} }
template <typename T> template <typename T>
bool Receive(T* data) { bool Receive(T* data) {
if (!IsInitialized()) return false; PADDLE_ENFORCE_EQ(IsInitialized(), true,
PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T))); "The Channel hasn't been initialized");
PADDLE_ENFORCE_EQ(
holder_->Type(), std::type_index(typeid(T)),
"Channel type is not same as the type of the data being sent");
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
return channel != nullptr ? channel->Receive(data) : false; PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
return channel->Receive(data);
} }
bool IsClosed() { bool IsClosed() {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
return holder_->IsClosed(); "The Channel hasn't been initialized");
} return holder_->IsClosed();
return false;
} }
bool CanSend() { bool CanSend() {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
return holder_->CanSend(); "The Channel hasn't been initialized");
} return holder_->CanSend();
return false;
} }
bool CanReceive() { bool CanReceive() {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
return holder_->CanReceive(); "The Channel hasn't been initialized");
} return holder_->CanReceive();
return false;
} }
void close() { void close() {
if (IsInitialized()) holder_->Close(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Close();
} }
size_t Cap() { size_t Cap() {
if (IsInitialized()) return holder_->Cap(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
return -1; "The Channel hasn't been initialized");
return holder_->Cap();
} }
void Lock() { void Lock() {
if (IsInitialized()) holder_->Lock(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Lock();
} }
void Unlock() { void Unlock() {
if (IsInitialized()) holder_->Unlock(); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Unlock();
} }
template <typename T> template <typename T>
void AddToSendQ(const void* referrer, T* data, void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond, std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) { std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); "The Channel hasn't been initialized");
if (channel != nullptr) { Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
channel->AddToSendQ(referrer, data, cond, cb); if (channel != nullptr) {
} channel->AddToSendQ(referrer, data, cond, cb);
} }
} }
...@@ -154,26 +166,31 @@ class ChannelHolder { ...@@ -154,26 +166,31 @@ class ChannelHolder {
void AddToReceiveQ(const void* referrer, T* data, void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond, std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) { std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) { PADDLE_ENFORCE_EQ(IsInitialized(), true,
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr()); "The Channel hasn't been initialized");
if (channel != nullptr) { Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
channel->AddToReceiveQ(referrer, data, cond, cb); if (channel != nullptr) {
} channel->AddToReceiveQ(referrer, data, cond, cb);
} }
} }
void RemoveFromSendQ(const void* referrer) { void RemoveFromSendQ(const void* referrer) {
if (IsInitialized()) holder_->RemoveFromSendQ(referrer); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->RemoveFromSendQ(referrer);
} }
void RemoveFromReceiveQ(const void* referrer) { void RemoveFromReceiveQ(const void* referrer) {
if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->RemoveFromReceiveQ(referrer);
} }
inline bool IsInitialized() const { return holder_ != nullptr; } inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() { inline const std::type_index Type() {
PADDLE_ENFORCE_EQ(IsInitialized(), true); PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->Type(); return holder_->Type();
} }
......
...@@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
public: public:
virtual bool CanSend(); virtual bool CanSend();
virtual bool CanReceive(); virtual bool CanReceive();
virtual bool Send(T *); virtual void Send(T *);
virtual bool Receive(T *); virtual bool Receive(T *);
virtual size_t Cap() { return cap_; } virtual size_t Cap() { return cap_; }
virtual void Lock(); virtual void Lock();
...@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel<T> {
} }
}; };
bool send_return(bool value) { void send_return() {
send_ctr--; send_ctr--;
destructor_cond_.notify_all(); destructor_cond_.notify_all();
return value;
} }
bool recv_return(bool value) { bool recv_return(bool value) {
...@@ -118,15 +117,15 @@ bool ChannelImpl<T>::CanReceive() { ...@@ -118,15 +117,15 @@ bool ChannelImpl<T>::CanReceive() {
} }
template <typename T> template <typename T>
bool ChannelImpl<T>::Send(T *item) { void ChannelImpl<T>::Send(T *item) {
send_ctr++; send_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_}; std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed, do nothing // If channel is closed, throw exception
if (closed_) { if (closed_) {
lock.unlock(); lock.unlock();
// TODO(abhinavarora) Should panic on closed channel send_return();
return send_return(false); PADDLE_THROW("Cannot send on closed channel");
} }
// If there is a receiver, directly pass the value we want // If there is a receiver, directly pass the value we want
...@@ -143,7 +142,7 @@ bool ChannelImpl<T>::Send(T *item) { ...@@ -143,7 +142,7 @@ bool ChannelImpl<T>::Send(T *item) {
if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND); if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
if (do_send) if (do_send)
*(m->data) = std::move(*item); *(m->data) = std::move(*item);
else else {
// We cannot do the data transfer because // We cannot do the data transfer because
// this QueueMessage was added by Select // this QueueMessage was added by Select
// and some other case was executed. // and some other case was executed.
...@@ -151,12 +150,17 @@ bool ChannelImpl<T>::Send(T *item) { ...@@ -151,12 +150,17 @@ bool ChannelImpl<T>::Send(T *item) {
// We do not care about notifying other // We do not care about notifying other
// because they would have been notified // because they would have been notified
// by the executed select case. // by the executed select case.
return send_return(Send(item)); lock.unlock();
Send(item);
send_return();
return;
}
// Wake up the blocked process and unlock // Wake up the blocked process and unlock
m->Notify(); m->Notify();
lock.unlock(); lock.unlock();
return send_return(true); send_return();
return;
} }
// Unbuffered channel will always bypass this // Unbuffered channel will always bypass this
...@@ -167,7 +171,8 @@ bool ChannelImpl<T>::Send(T *item) { ...@@ -167,7 +171,8 @@ bool ChannelImpl<T>::Send(T *item) {
buf_.push_back(std::move(*item)); buf_.push_back(std::move(*item));
// Release lock and return true // Release lock and return true
lock.unlock(); lock.unlock();
return send_return(true); send_return();
return;
} }
// Block on channel, because some receiver will complete // Block on channel, because some receiver will complete
...@@ -175,8 +180,12 @@ bool ChannelImpl<T>::Send(T *item) { ...@@ -175,8 +180,12 @@ bool ChannelImpl<T>::Send(T *item) {
auto m = std::make_shared<QueueMessage>(item); auto m = std::make_shared<QueueMessage>(item);
sendq.push_back(m); sendq.push_back(m);
m->Wait(lock); m->Wait(lock);
// TODO(abhinavarora) Should panic on closed channel if (m->chan_closed) {
return send_return(!m->chan_closed); lock.unlock();
send_return();
PADDLE_THROW("Cannot send on closed channel");
}
send_return();
} }
template <typename T> template <typename T>
......
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <chrono> #include <chrono>
#include <thread> #include <thread>
#include "gtest/gtest.h" #include "gtest/gtest.h"
using paddle::framework::Channel; using paddle::framework::Channel;
...@@ -41,7 +40,7 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) { ...@@ -41,7 +40,7 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
unsigned sum_send = 0; unsigned sum_send = 0;
std::thread t([&]() { std::thread t([&]() {
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true); ch->Send(&i);
sum_send += i; sum_send += i;
} }
}); });
...@@ -61,7 +60,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) { ...@@ -61,7 +60,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10; const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size); auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) { for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Send(&i), true); // should not block ch->Send(&i);
} }
size_t out; size_t out;
...@@ -82,7 +81,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) { ...@@ -82,7 +81,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
const size_t data = 5; const size_t data = 5;
std::thread send_thread{[&]() { std::thread send_thread{[&]() {
size_t i = data; size_t i = data;
EXPECT_EQ(ch->Send(&i), true); // should not block ch->Send(&i); // should not block
}}; }};
std::thread recv_thread{[&]() { std::thread recv_thread{[&]() {
...@@ -94,12 +93,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) { ...@@ -94,12 +93,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
send_thread.join(); send_thread.join();
recv_thread.join(); recv_thread.join();
// After closing send should return false. Receive should // After closing send should panic. Receive should
// also return false as there is no data in queue. // also false as there is no data in queue.
CloseChannel(ch); CloseChannel(ch);
send_thread = std::thread{[&]() { send_thread = std::thread{[&]() {
size_t i = data; size_t i = data;
EXPECT_EQ(ch->Send(&i), false); // should return false bool is_exception = false;
try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
}}; }};
recv_thread = std::thread{[&]() { recv_thread = std::thread{[&]() {
size_t i; size_t i;
...@@ -129,7 +134,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { ...@@ -129,7 +134,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
auto ch = MakeChannel<size_t>(buffer_size); auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) { for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Send(&i), true); // sending should not block ch->Send(&i); // sending should not block
} }
size_t out; size_t out;
...@@ -160,9 +165,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -160,9 +165,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
// Try to write more than buffer size. // Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) { for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size) if (i < buffer_size)
EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations ch->Send(&i); // should block after 10 iterations
else else {
EXPECT_EQ(ch->Send(&i), false); bool is_exception = false;
try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
}
} }
}); });
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
...@@ -231,7 +243,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { ...@@ -231,7 +243,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
...@@ -316,8 +334,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { ...@@ -316,8 +334,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
// Try to send more number of times // Try to send more number of times
// than receivers // than receivers
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
ch->Send(&i); try {
sum_send += i; ch->Send(&i);
sum_send += i;
} catch (paddle::platform::EnforceNotMet e) {
}
} }
}); });
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
...@@ -382,7 +403,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -382,7 +403,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
...@@ -508,7 +535,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) { ...@@ -508,7 +535,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) {
unsigned sum_send = 0; unsigned sum_send = 0;
std::thread t([&]() { std::thread t([&]() {
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true); ch->Send(&i);
sum_send += i; sum_send += i;
} }
}); });
...@@ -541,8 +568,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) { ...@@ -541,8 +568,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) {
ChannelHolder *ch = new ChannelHolder(); ChannelHolder *ch = new ChannelHolder();
EXPECT_EQ(ch->IsInitialized(), false); EXPECT_EQ(ch->IsInitialized(), false);
int i = 10; int i = 10;
EXPECT_EQ(ch->Send(&i), false); bool send_exception = false;
EXPECT_EQ(ch->Receive(&i), false); try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
send_exception = true;
}
EXPECT_EQ(send_exception, true);
bool recv_exception = false;
try {
ch->Receive(&i);
} catch (paddle::platform::EnforceNotMet e) {
recv_exception = true;
}
EXPECT_EQ(recv_exception, true);
bool is_exception = false; bool is_exception = false;
try { try {
ch->Type(); ch->Type();
...@@ -669,7 +710,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { ...@@ -669,7 +710,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
...@@ -760,7 +807,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -760,7 +807,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended, bool *success) { [&](bool *ended, bool *success) {
int data = 10; int data = 10;
*success = ch->Send(&data); bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true; *ended = true;
}, },
&thread_ended[i], &send_success[i]); &thread_ended[i], &send_success[i]);
......
...@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, ...@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
// Select block // Select block
AddOp("select", {{"X", {dataChanName, quitChanName}}, AddOp("select", {{"X", {dataChanName, quitChanName}},
{"case_to_execute", {"caseToExecute"}}}, {"case_to_execute", {"caseToExecute"}}},
{}, {{"sub_block", casesBlock}, {{"Out", {}}},
{"cases", std::vector<std::string>{case0Config, case1Config}}}, {{"sub_block", casesBlock},
{"cases", std::vector<std::string>{case0Config, case1Config}}},
whileBlock); whileBlock);
scope->Var("stepScopes"); scope->Var("stepScopes");
...@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) { ...@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) {
executor.Run(program, &scope, 0, true, true); executor.Run(program, &scope, 0, true, true);
// After we call executor.run, the Go operator should do a channel_send to set // After we call executor.run, the Go operator should do a channel_send to
// the // set the "result" variable to 99.
// "result" variable to 99
auto *finalData = tensor.data<int>(); auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 99); EXPECT_EQ(finalData[0], 99);
} }
......
...@@ -14,12 +14,8 @@ limitations under the License. */ ...@@ -14,12 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include <set>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -40,14 +36,13 @@ namespace { ...@@ -40,14 +36,13 @@ namespace {
int kProgramId = -1; int kProgramId = -1;
} // namespace } // namespace
struct ExecutorPrepareContext { ExecutorPrepareContext::ExecutorPrepareContext(
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id) const framework::ProgramDesc& prog, size_t block_id)
: prog_(prog), block_id_(block_id) {} : prog_(prog), block_id_(block_id) {}
const framework::ProgramDesc prog_; ExecutorPrepareContext::~ExecutorPrepareContext() {
size_t block_id_; VLOG(5) << "destroy ExecutorPrepareContext";
std::vector<std::unique_ptr<OperatorBase>> ops_; }
};
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
...@@ -101,9 +96,8 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -101,9 +96,8 @@ static void CheckTensorNANOrInf(const std::string& name,
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id); platform::RecordBlock b(block_id);
auto* ctx = Prepare(pdesc, block_id); auto ctx = Prepare(pdesc, block_id);
RunPreparedContext(ctx, scope, create_local_scope, create_vars); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
delete ctx;
} }
// Check whether the block already has feed operators and feed_holder. // Check whether the block already has feed operators and feed_holder.
...@@ -191,7 +185,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -191,7 +185,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name, const std::string& feed_holder_name,
const std::string& fetch_holder_name) { const std::string& fetch_holder_name, bool create_vars) {
platform::RecordBlock b(kProgramId); platform::RecordBlock b(kProgramId);
bool has_feed_ops = bool has_feed_ops =
has_feed_operators(program.Block(0), feed_targets, feed_holder_name); has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
...@@ -261,7 +255,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -261,7 +255,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
} }
} }
Run(*copy_program, scope, 0, true, true); Run(*copy_program, scope, 0, create_vars, create_vars);
// obtain the data of fetch_targets from fetch_holder // obtain the data of fetch_targets from fetch_holder
for (auto* op : global_block->AllOps()) { for (auto* op : global_block->AllOps()) {
...@@ -274,15 +268,15 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -274,15 +268,15 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
} }
} }
ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program, std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
int block_id) { const ProgramDesc& program, int block_id) {
auto* ctx = new ExecutorPrepareContext(program, block_id); auto* ctx = new ExecutorPrepareContext(program, block_id);
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size()); PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id); auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
return ctx; return std::unique_ptr<ExecutorPrepareContext>(ctx);
} }
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
......
...@@ -22,7 +22,16 @@ limitations under the License. */ ...@@ -22,7 +22,16 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
struct ExecutorPrepareContext;
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
~ExecutorPrepareContext();
const framework::ProgramDesc& prog_;
size_t block_id_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
};
class Executor { class Executor {
public: public:
// TODO(dzhwinter) : Do not rely on this function, it will be removed // TODO(dzhwinter) : Do not rely on this function, it will be removed
...@@ -45,10 +54,11 @@ class Executor { ...@@ -45,10 +54,11 @@ class Executor {
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name = "feed", const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch"); const std::string& fetch_holder_name = "fetch",
bool create_vars = true);
static ExecutorPrepareContext* Prepare(const ProgramDesc& program, static std::unique_ptr<ExecutorPrepareContext> Prepare(
int block_id); const ProgramDesc& program, int block_id);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true, bool create_local_scope = true,
......
...@@ -176,7 +176,7 @@ class Vector { ...@@ -176,7 +176,7 @@ class Vector {
// resize the vector // resize the vector
void resize(size_t size) { void resize(size_t size) {
if (size + 1 < capacity()) { if (size + 1 <= capacity()) {
size_ = size; size_ = size;
} else { } else {
MutableCPU(); MutableCPU();
......
...@@ -104,3 +104,11 @@ TEST(mixed_vector, ForEach) { ...@@ -104,3 +104,11 @@ TEST(mixed_vector, ForEach) {
for (auto& v : tmp) { for (auto& v : tmp) {
} }
} }
TEST(mixed_vector, Reserve) {
paddle::framework::Vector<int> vec;
vec.reserve(1);
vec.push_back(0);
vec.push_back(0);
vec.push_back(0);
}
...@@ -45,10 +45,11 @@ class Tensor { ...@@ -45,10 +45,11 @@ class Tensor {
friend struct EigenVector; friend struct EigenVector;
public: public:
Tensor() : offset_(0) {} Tensor() : offset_(0), is_pinned_(false) {}
/*! Constructor with place should only be used in pybind. */ /*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) { explicit Tensor(const platform::Place& place)
: offset_(0), is_pinned_(false) {
holder_->set_place(place); holder_->set_place(place);
} }
...@@ -69,11 +70,12 @@ class Tensor { ...@@ -69,11 +70,12 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
inline T* mutable_data(platform::Place place); inline T* mutable_data(platform::Place place, bool is_pinned = false);
inline void* mutable_data(platform::Place place, std::type_index type); inline void* mutable_data(platform::Place place, std::type_index type,
bool is_pinned = false);
inline void* mutable_data(platform::Place place); inline void* mutable_data(platform::Place place, bool is_pinned = false);
/** /**
* @brief Return a pointer to mutable memory block. * @brief Return a pointer to mutable memory block.
...@@ -84,7 +86,8 @@ class Tensor { ...@@ -84,7 +86,8 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
inline T* mutable_data(DDim dims, platform::Place place); inline T* mutable_data(DDim dims, platform::Place place,
bool is_pinned = false);
/*! Return the dimensions of the memory block. */ /*! Return the dimensions of the memory block. */
inline const DDim& dims() const; inline const DDim& dims() const;
...@@ -92,6 +95,9 @@ class Tensor { ...@@ -92,6 +95,9 @@ class Tensor {
/*! Return the numel of the memory block. */ /*! Return the numel of the memory block. */
inline int64_t numel() const; inline int64_t numel() const;
/*! Return the numel of the memory block. */
inline bool isPinned() const;
/*! Resize the dimensions of the memory block. */ /*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims); inline Tensor& Resize(const DDim& dims);
...@@ -146,12 +152,14 @@ class Tensor { ...@@ -146,12 +152,14 @@ class Tensor {
template <typename Place> template <typename Place>
struct PlaceholderImpl : public Placeholder { struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type) PlaceholderImpl(Place place, size_t size, std::type_index type,
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)), bool is_pinned = false)
memory::PODDeleter<uint8_t, Place>(place)), : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
place_(place), place_(place),
size_(size), size_(size),
type_(type) { type_(type),
is_pinned_(is_pinned) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU")); (is_cpu_place(place_) ? "CPU" : "GPU"));
} }
...@@ -174,6 +182,9 @@ class Tensor { ...@@ -174,6 +182,9 @@ class Tensor {
/* the current type of memory */ /* the current type of memory */
std::type_index type_; std::type_index type_;
/*! use pinned memory or not. */
bool is_pinned_;
}; };
/*! holds the memory block if allocated. */ /*! holds the memory block if allocated. */
...@@ -208,6 +219,7 @@ class Tensor { ...@@ -208,6 +219,7 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins. * PlaceHolder::ptr_ and where the tensor data really begins.
*/ */
size_t offset_; size_t offset_;
bool is_pinned_;
}; };
inline void Tensor::switch_place(platform::Place new_place) { inline void Tensor::switch_place(platform::Place new_place) {
......
...@@ -101,52 +101,55 @@ inline T* Tensor::data() { ...@@ -101,52 +101,55 @@ inline T* Tensor::data() {
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place) { inline T* Tensor::mutable_data(DDim dims, platform::Place place,
bool is_pinned) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims); Resize(dims);
return mutable_data<T>(place); return mutable_data<T>(place, is_pinned);
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(platform::Place place) { inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T))); return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
} }
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
bool is_pinned) {
if (holder_ != nullptr) { if (holder_ != nullptr) {
holder_->set_type(type); holder_->set_type(type);
} }
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GE(numel(), 0,
numel(), 0, "When calling this method, the Tensor's numel must be "
"When calling this method, the Tensor's numel must be larger than zero. " "equal or larger than zero. "
"Please check Tensor::Resize has been called first."); "Please check Tensor::Resize has been called first.");
int64_t size = numel() * SizeOfType(type); int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */ /* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) || if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) { holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>( holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type)); boost::get<platform::CPUPlace>(place), size, type, is_pinned));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
} }
#else #else
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>( holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type)); boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
} }
#endif #endif
offset_ = 0; offset_ = 0;
is_pinned_ = is_pinned;
} }
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_); offset_);
} }
inline void* Tensor::mutable_data(platform::Place place) { inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
PADDLE_ENFORCE(this->holder_ != nullptr, PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing"); "Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type()); return mutable_data(place, holder_->type(), is_pinned);
} }
inline Tensor& Tensor::ShareDataWith(const Tensor& src) { inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
...@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; } ...@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); } inline int64_t Tensor::numel() const { return product(dims_); }
inline bool Tensor::isPinned() const { return is_pinned_; }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res; Tensor res;
res.ShareDataWith(src); res.ShareDataWith(src);
......
...@@ -119,6 +119,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -119,6 +119,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
bool GPUAllocator::UseGpu() const { return true; } bool GPUAllocator::UseGpu() const { return true; }
// PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
if (size <= 0) return nullptr;
void* p;
// NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
if (size > usable) return nullptr;
// PINNED memory is visible to all CUDA contexts.
cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) {
index = 1;
fallback_alloc_size_ += size;
return p;
}
return nullptr;
}
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
PADDLE_ASSERT(index == 1);
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because
// that is returned if you ever call cudaFreeHost after the
// driver has already shutdown. This happens only if the
// process is terminating, in which case we don't care if
// cudaFreeHost succeeds.
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free.");
}
}
bool CUDAPinnedAllocator::UseGpu() const { return true; }
#endif #endif
} // namespace detail } // namespace detail
......
...@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator { ...@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
size_t gpu_alloc_size_ = 0; size_t gpu_alloc_size_ = 0;
size_t fallback_alloc_size_ = 0; size_t fallback_alloc_size_ = 0;
}; };
class CUDAPinnedAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ =
0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
size_t fallback_alloc_size_ = 0;
};
#endif #endif
} // namespace detail } // namespace detail
......
...@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() { ...@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
} }
template <> template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
bool is_pinned) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size); void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p; VLOG(10) << " pointer=" << p;
...@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { ...@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
} }
template <> template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) { void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
bool is_pinned) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
} }
...@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { ...@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
return as[gpu_id]; return as[gpu_id];
} }
BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
if (as == NULL) {
int gpu_num = platform::GetCUDADeviceCount();
as = new BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
as[gpu] = nullptr;
}
}
platform::SetDeviceId(gpu_id);
if (!as[gpu_id]) {
as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
return as[gpu_id];
}
template <> template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) { size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used(); return GetGPUBuddyAllocator(place.device)->Used();
} }
template <> template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
auto* buddy_allocator = GetGPUBuddyAllocator(place.device); bool is_pinned) {
auto* ptr = buddy_allocator->Alloc(size); void* ptr;
if (is_pinned) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
} else {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
}
if (ptr == nullptr) { if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId(); int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device); platform::SetDeviceId(place.device);
...@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { ...@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
} }
template <> template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) { void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
GetGPUBuddyAllocator(place.device)->Free(p); bool is_pinned) {
if (is_pinned) {
GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
} else {
GetGPUBuddyAllocator(place.device)->Free(p);
}
} }
#endif #endif
......
...@@ -33,7 +33,7 @@ namespace memory { ...@@ -33,7 +33,7 @@ namespace memory {
* address is valid or not. * address is valid or not.
*/ */
template <typename Place> template <typename Place>
void* Alloc(Place place, size_t size); void* Alloc(Place place, size_t size, bool is_pinned = false);
/** /**
* \brief Free memory block in one place. * \brief Free memory block in one place.
...@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size); ...@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
* *
*/ */
template <typename Place> template <typename Place>
void Free(Place place, void* ptr); void Free(Place place, void* ptr, bool is_pinned = false);
/** /**
* \brief Total size of used memory in one place. * \brief Total size of used memory in one place.
...@@ -74,11 +74,13 @@ class PODDeleter { ...@@ -74,11 +74,13 @@ class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
public: public:
explicit PODDeleter(Place place) : place_(place) {} explicit PODDeleter(Place place, bool is_pinned = false)
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); } : place_(place), is_pinned_(is_pinned) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
private: private:
Place place_; Place place_;
bool is_pinned_;
}; };
/** /**
......
...@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { ...@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
EXPECT_EQ(total_size, 0UL); EXPECT_EQ(total_size, 0UL);
for (auto size : for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size; ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk // Buddy Allocator doesn't manage too large memory chunk
...@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { ...@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
EXPECT_EQ(total_size, 0UL); EXPECT_EQ(total_size, 0UL);
for (auto size : for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(gpu, size)] = size; ps[paddle::memory::Alloc(gpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk // Buddy Allocator doesn't manage too large memory chunk
......
...@@ -12,6 +12,8 @@ function(op_library TARGET) ...@@ -12,6 +12,8 @@ function(op_library TARGET)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs) set(cu_cc_srcs)
set(cudnn_cu_cc_srcs) set(cudnn_cu_cc_srcs)
set(CUDNN_FILE) set(CUDNN_FILE)
...@@ -36,10 +38,19 @@ function(op_library TARGET) ...@@ -36,10 +38,19 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif() endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN) if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
...@@ -48,10 +59,14 @@ function(op_library TARGET) ...@@ -48,10 +59,14 @@ function(op_library TARGET)
endif() endif()
else() else()
foreach(src ${op_library_SRCS}) foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$") if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src}) list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src}) list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src}) list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$") elseif(${src} MATCHES ".*\\.cu.cc$")
...@@ -76,6 +91,9 @@ function(op_library TARGET) ...@@ -76,6 +91,9 @@ function(op_library TARGET)
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else() else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
...@@ -88,7 +106,7 @@ function(op_library TARGET) ...@@ -88,7 +106,7 @@ function(op_library TARGET)
endif() endif()
endforeach() endforeach()
# The registration of USE_OP, please refer to paddle/framework/op_registry.h. # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just adding one operator to pybind in a *_op.cc file. # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h. # And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT) file(READ ${TARGET}.cc TARGET_CONTENT)
...@@ -114,7 +132,10 @@ function(op_library TARGET) ...@@ -114,7 +132,10 @@ function(op_library TARGET)
list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -125,9 +146,19 @@ function(op_library TARGET) ...@@ -125,9 +146,19 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif() endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN # pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif() endif()
# pybind USE_OP # pybind USE_OP
...@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE) ...@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
endif() endif()
op_library(cond_op DEPS framework_proto tensor net_op) op_library(cond_op DEPS framework_proto tensor net_op)
...@@ -229,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor ...@@ -229,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename ExecContext>
void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *src = ctx.template Input<Tensor>("X");
const auto *src_data = src->template data<T>();
auto *dst = ctx.template Output<Tensor>("Out");
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim
PADDLE_ENFORCE(src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description
// TODO(kbinias-intel): support more formats
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
// save prim desc into global device context to be referred in backward path
const std::string key = ctx.op().Output("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine);
dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
template <typename T, typename ExecContext>
void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *x = ctx.template Input<Tensor>("X");
const auto *src = x->template data<T>();
auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto *diff_dst = dout->template data<T>();
auto *dx =
ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
// get memory dim
std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);
auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
// retrieve eltwise primitive desc from device context
const std::string key = ctx.op().Input("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
PADDLE_ENFORCE(forward_pd != nullptr,
"Fail to find eltwise_pd in device context");
auto *p_forward_pd =
static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
backward_desc, mkldnn_engine, *p_forward_pd);
auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
diff_dst_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // anonymous namespace
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_forward<T>(ctx, algorithm);
}
};
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_grad<T>(ctx, algorithm);
}
};
template <typename T>
using ReluMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
template <typename T>
using ReluMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>); \
REGISTER_OP_KERNEL( \
act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
__macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
__macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
__macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Relu operator"); AddInput("X", "Input of Relu operator");
AddOutput("Out", "Output of Relu operator"); AddOutput("Out", "Output of Relu operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Relu Activation Operator. Relu Activation Operator.
...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Tanh operator"); AddInput("X", "Input of Tanh operator");
AddOutput("Out", "Output of Tanh operator"); AddOutput("Out", "Output of Tanh operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Tanh Activation Operator. Tanh Activation Operator.
...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Sqrt operator"); AddInput("X", "Input of Sqrt operator");
AddOutput("Out", "Output of Sqrt operator"); AddOutput("Out", "Output of Sqrt operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Sqrt Activation Operator. Sqrt Activation Operator.
...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Abs operator"); AddInput("X", "Input of Abs operator");
AddOutput("Out", "Output of Abs operator"); AddOutput("Out", "Output of Abs operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Abs Activation Operator. Abs Activation Operator.
...@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, ...@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
tanh_shrink_grad, ops::ActivationOpGrad); tanh_shrink_grad, ops::ActivationOpGrad);
...@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, ...@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
softshrink_grad, ops::ActivationOpGrad); softshrink_grad, ops::ActivationOpGrad);
REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
...@@ -613,3 +626,14 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ...@@ -613,3 +626,14 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
ops::grad_functor<double>>); ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
REGISTER_OP_CPU_KERNEL(relu,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ReluFunctor<float>>,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ReluFunctor<double>>);
REGISTER_OP_CPU_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::ReluGradFunctor<float>>,
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::ReluGradFunctor<double>>);
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
...@@ -31,3 +32,16 @@ namespace ops = paddle::operators; ...@@ -31,3 +32,16 @@ namespace ops = paddle::operators;
ops::grad_functor<double>>); ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
REGISTER_OP_CUDA_KERNEL(
relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<float>>,
ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<double>>,
ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<paddle::platform::float16>>);
REGISTER_OP_CUDA_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
ops::ReluGradFunctor<float>>,
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
ops::ReluGradFunctor<double>>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -17,6 +17,10 @@ limitations under the License. */ ...@@ -17,6 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -772,7 +776,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -772,7 +776,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \ __macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/average_accumulates_op.h"
namespace paddle {
namespace operators {
template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
num_updates_ = in_num_updates->data<int64_t>()[0];
}
template <>
void SetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_,
int64_t num_accumulates_, int64_t old_num_accumulates_) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
out_num_updates->data<int64_t>()[0] = num_updates_;
}
class AverageAccumulatesOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("param"),
"Input (param) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("in_sum_1"),
"Input (sum_1) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("in_sum_2"),
"Input (sum_2) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("in_sum_3"),
"Input (sum_3) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("in_num_accumulates"),
"Input (in_num_accumulates) of average_accumulates op should "
"not be null.");
PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
"Input (old_num_accumulates) of average_accumulates op "
"should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("in_num_updates"),
"Input (num_updates) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("out_sum_1"),
"Output (sum_1) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("out_sum_2"),
"Output (sum_2) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("out_sum_3"),
"Output (sum_3) of average_accumulates op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
"Output (num_accumulates) of average_accumulates op should "
"not be null.");
PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
"Output (old_num_accumulates) of average_accumulates op "
"should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("out_num_updates"),
"Output (num_updates) of average_accumulates op should not be null.");
auto in_dim = ctx->GetInputDim("param");
ctx->SetOutputDim("out_sum_1", in_dim);
ctx->SetOutputDim("out_sum_2", in_dim);
ctx->SetOutputDim("out_sum_3", in_dim);
ctx->SetOutputDim("out_num_accumulates", {1});
ctx->SetOutputDim("out_old_num_accumulates", {1});
ctx->SetOutputDim("out_num_updates", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("param")->type()),
ctx.GetPlace());
}
};
class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("param", "(Tensor), The parameter to be accumulated.");
AddInput("in_sum_1",
"(Tensor), A tensor used to store the parameter "
"sums with the same shape as input(param).");
AddInput("in_sum_2",
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddInput("in_sum_3",
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddInput("in_num_accumulates",
"(Tensor<int64_t>), The accumulating times of current window with "
"shape [1].");
AddInput(
"in_old_num_accumulates",
"(Tensor<int64_t>), The accumulating times of previous window with "
"shape [1].");
AddInput("in_num_updates",
"(Tensor<int64_t>), The total number of batches used by trainning "
"before this batch with shape [1].");
AddOutput("out_sum_1",
"(Tensor), A tensor used to store the "
"parameter sums with the same shape as input(param).");
AddOutput("out_sum_2",
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddOutput("out_sum_3",
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddOutput(
"out_num_accumulates",
"(Tensor<int64_t>), The accumulating times of current window with "
"shape [1].");
AddOutput(
"out_old_num_accumulates",
"(Tensor<int64_t>) The accumulating times of previous window with "
"shape [1].");
AddOutput(
"out_num_updates",
"(Tensor<int64_t>), The total number of batches used by trainning "
"before this batch with shape [1].");
AddAttr<float>("average_window",
"(float, default 0) "
"The rate of average window size relative to num_updates.")
.SetDefault(0);
AddAttr<int64_t>("max_average_window",
"(int64_t) "
"Maximum size of average window. It suggests that the "
"number of mini-batches "
"in one pass is appropriate value to set.");
AddAttr<int64_t>("min_average_window",
"(int64_t, default 10000L) "
"Minimu size of average window.")
.SetDefault(10000L);
AddComment(R"DOC(
AverageAccumulates Operator.
Accumulate the sum of parameter whtin sliding window. The size of sliding window is
determined by 'average_window', 'max_average_window' and 'min_average_window'.
Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
All the accumulators were inited to zero before training.
And for a mini-batch in training, accumulators were computed as below steps:
num_updates += 1
num_accumulates += 1
sum_1 += param
if num_updates % kMaxNumAccumulates == 0:
sum_2 += sum_1
sum_1 = 0
if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
sum_3 = sum_1 + sum_2
sum_1 = 0
sum_2 = 0
old_num_accumulates = num_accumulates
num_accumulates = 0
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
ops::AverageAccumulatesOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/average_accumulates_op.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
template <>
void GetAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
}
template <>
void SetAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_,
int64_t num_accumulates_, int64_t old_num_accumulates_) {
auto stream = ctx.cuda_device_context().stream();
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx,
int64_t& num_updates, int64_t& num_accumulates,
int64_t& old_num_accumulates);
template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx,
int64_t num_updates, int64_t num_accumulates,
int64_t old_num_accumulates);
template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// It is used to avoid loss of precision
static const int64_t kMaxNumAccumulates = 16384;
// Get accumulators from input
int64_t num_updates = 0;
int64_t num_accumulates = 0;
int64_t old_num_accumulates = 0;
GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
old_num_accumulates);
// Get attrs
float average_window = ctx.Attr<float>("average_window");
int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
min_average_window =
std::min<int64_t>(min_average_window, max_average_window);
// Get inputs
auto* param = ctx.Input<Tensor>("param");
auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
auto param_tensor = EigenVector<T>::Flatten(*param);
auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
// Get outputs
auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
// Compute
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::SetConstant<DeviceContext, T> constant_functor;
++num_updates;
++num_accumulates;
out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
out_sum_2_tensor.device(place) = in_sum_2_tensor;
out_sum_3_tensor.device(place) = in_sum_3_tensor;
if (num_updates % kMaxNumAccumulates == 0) {
// Move the sum to a different buffer to avoid loss of precision due to
// too many sums.
out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0);
}
if (num_accumulates >= min_average_window &&
num_accumulates >= std::min<int64_t>(max_average_window,
num_updates * average_window)) {
// Now the average window is too long, discard the old sum.
out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0);
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
0.0);
old_num_accumulates = num_accumulates;
num_accumulates = 0;
}
// Set accumulators to output
SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
old_num_accumulates);
}
};
} // namespace operators
} // namespace paddle
...@@ -80,6 +80,29 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -80,6 +80,29 @@ class BatchNormOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("SavedVariance", {C}); ctx->SetOutputDim("SavedVariance", {C});
ctx->ShareLoD("X", "Y"); ctx->ShareLoD("X", "Y");
} }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
// For float or float16 input tensor, the type of the scale, bias, mean,
// and var tensors should both be float.
auto bn_param_type = framework::proto::VarType::FP32;
PADDLE_ENFORCE_EQ(bn_param_type,
framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
"Scale input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type,
framework::ToDataType(ctx.Input<Tensor>("Bias")->type()),
"Bias input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type,
framework::ToDataType(ctx.Input<Tensor>("Mean")->type()),
"Mean input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
ctx.Input<Tensor>("Variance")->type()),
"Variance input should be of float type");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -434,12 +457,39 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -434,12 +457,39 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
} }
}; };
class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op = new framework::OpDesc();
op->SetType("batch_norm_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetInput("Scale", Input("Scale"));
op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput("SavedVariance", Output("SavedVariance"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
batch_norm_grad, ops::BatchNormGradOp); ops::BatchNormGradMaker);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
batch_norm, batch_norm,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>); ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <cfloat> #include <cfloat>
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -26,6 +27,8 @@ using Tensor = framework::Tensor; ...@@ -26,6 +27,8 @@ using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout; using DataLayout = framework::DataLayout;
template <typename T> template <typename T>
using CudnnDataType = platform::CudnnDataType<T>; using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
int *N, int *C, int *H, int *W, int *D) { int *N, int *C, int *H, int *W, int *D) {
...@@ -104,8 +107,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -104,8 +107,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type, data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
// Note: PERSISTENT not implemented for inference
CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_)); bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
const auto *scale = ctx.Input<Tensor>("Scale"); const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias"); const auto *bias = ctx.Input<Tensor>("Bias");
...@@ -118,15 +122,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -118,15 +122,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
// alloc memory // alloc memory
y->mutable_data<T>(ctx.GetPlace()); y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<T>(ctx.GetPlace()); mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<T>(ctx.GetPlace()); variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<T>(ctx.GetPlace()); saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<T>(ctx.GetPlace()); saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, T> functor; math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor(dev_ctx, saved_mean, 0); functor;
functor(dev_ctx, saved_variance, 0); functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
...@@ -147,8 +152,10 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -147,8 +152,10 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(), CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(), CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
data_desc_, y->template mutable_data<T>(ctx.GetPlace()), data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<T>(), bias->template data<T>(), bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
est_mean->template data<T>(), est_var->template data<T>(), epsilon)); bias->template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(), epsilon));
} else { } else {
// Run training mode. // Run training mode.
// obtain running mean and running inv var, and see if we need to // obtain running mean and running inv var, and see if we need to
...@@ -159,11 +166,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -159,11 +166,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
data_desc_, x->template data<T>(), data_desc_, data_desc_, x->template data<T>(), data_desc_,
y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_, y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
scale->template data<T>(), bias->template data<T>(), this_factor, scale->template data<BatchNormParamType<T>>(),
mean_out->template mutable_data<T>(ctx.GetPlace()), bias->template data<BatchNormParamType<T>>(), this_factor,
variance_out->template mutable_data<T>(ctx.GetPlace()), epsilon, mean_out->template mutable_data<BatchNormParamType<T>>(
saved_mean->template mutable_data<T>(ctx.GetPlace()), ctx.GetPlace()),
saved_variance->template mutable_data<T>(ctx.GetPlace()))); variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
} }
// clean when exit. // clean when exit.
...@@ -270,9 +282,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -270,9 +282,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
batch_norm, batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<paddle::platform::CUDADeviceContext, float>); ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>);
ops::BatchNormGradKernel<paddle::platform::CUDADeviceContext, float>);
...@@ -126,6 +126,7 @@ width and height. ...@@ -126,6 +126,7 @@ width and height.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker); REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>, REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
ops::BoxCoderKernel<double>); ops::BoxCoderKernel<double>);
...@@ -23,21 +23,10 @@ limitations under the License. */ ...@@ -23,21 +23,10 @@ limitations under the License. */
static constexpr char Channel[] = "Channel"; static constexpr char Channel[] = "Channel";
static constexpr char X[] = "X"; static constexpr char X[] = "X";
static constexpr char Status[] = "Status";
static constexpr char copy[] = "copy";
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void SetSendStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
class ChannelSendOp : public framework::OperatorBase { class ChannelSendOp : public framework::OperatorBase {
public: public:
ChannelSendOp(const std::string &type, ChannelSendOp(const std::string &type,
...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase {
"Input(Channel) of ChannelSendOp should not be null."); "Input(Channel) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(X), PADDLE_ENFORCE(ctx->HasInput(X),
"Input(X) of ChannelSendOp should not be null."); "Input(X) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelSendOp should not be null.");
ctx->SetOutputDim("Status", {1});
} }
private: private:
...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase {
auto input_var = scope.FindVar(Input(X)); auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel. // Send the input data through the channel.
bool ok = concurrency::ChannelSend(ch, input_var); concurrency::ChannelSend(ch, input_var);
// Set the status output of the `ChannelSend` call.
SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
} }
}; };
...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
.AsDuplicable(); .AsDuplicable();
AddInput(X, "(Variable) The value which gets sent by the channel.") AddInput(X, "(Variable) The value which gets sent by the channel.")
.AsDuplicable(); .AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the send operation.")
.AsDuplicable();
AddAttr<bool>(copy, "(bool, default false) Should copy before send")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
} }
......
...@@ -17,20 +17,20 @@ limitations under the License. */ ...@@ -17,20 +17,20 @@ limitations under the License. */
namespace poc = paddle::operators::concurrency; namespace poc = paddle::operators::concurrency;
bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type()); auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>()); ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>()); ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>()); ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS) else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>()); ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER) else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>()); ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL) else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>()); ch->Send(var->GetMutable<framework::ChannelHolder>());
else else
PADDLE_THROW("ChannelSend:Unsupported type"); PADDLE_THROW("ChannelSend:Unsupported type");
} }
......
...@@ -21,7 +21,7 @@ namespace paddle { ...@@ -21,7 +21,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace concurrency { namespace concurrency {
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
......
...@@ -78,7 +78,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> { ...@@ -78,7 +78,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
for (int64_t i = 0; i < batch_size; ++i) { for (int64_t i = 0; i < batch_size; ++i) {
PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
int64_t index = i * class_num + label_data[i]; int64_t index = i * class_num + label_data[i];
dx_data[index] = -dy_data[i] / x_data[index]; dx_data[index] = math::TolerableValue<T>()(-dy_data[i] / x_data[index]);
} }
} }
} }
......
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc)
endif() endif()
...@@ -23,9 +23,107 @@ limitations under the License. */ ...@@ -23,9 +23,107 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
namespace grpc {
// A ZeroCopyInputStream that reads from grpc_byte_buffer
class GrpcBufferReader final
: public ::google::protobuf::io::ZeroCopyInputStream {
typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
typedef int (CoreCodegenInterface::*NewReaderInitAPI)(
grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
grpc_byte_buffer* buffer) {
(g_core_codegen_interface->*ptr)(reader, buffer);
}
void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
grpc_byte_buffer* buffer) {
int result = (g_core_codegen_interface->*ptr)(reader, buffer);
(void)result;
}
public:
explicit GrpcBufferReader(grpc_byte_buffer* buffer)
: byte_count_(0), backup_count_(0) {
ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
buffer);
}
~GrpcBufferReader() override {
g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
}
bool Next(const void** data, int* size) override {
if (backup_count_ > 0) {
*data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
backup_count_;
GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
*size = (int)backup_count_;
backup_count_ = 0;
return true;
}
if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_,
&slice_)) {
return false;
}
g_core_codegen_interface->grpc_slice_unref(slice_);
*data = GRPC_SLICE_START_PTR(slice_);
// On win x64, int is only 32bit
GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
return true;
}
void BackUp(int count) override { backup_count_ = count; }
bool Skip(int count) override {
const void* data;
int size;
while (Next(&data, &size)) {
if (size >= count) {
BackUp(size - count);
return true;
}
// size < count;
count -= size;
}
// error or we have too large count;
return false;
}
::google::protobuf::int64 ByteCount() const override {
return byte_count_ - backup_count_;
}
private:
int64_t byte_count_;
int64_t backup_count_;
grpc_byte_buffer_reader reader_;
grpc_slice slice_;
};
}; // namespace grpc
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
// Source provides a way for a particular RPC implementation to provide
// received data to ParseFrom.
class Source {
public:
virtual ~Source() {}
// Return the stream that contains the data to be parsed.
// Note that this method might be invoked more than once if
// ParseFrom needs to fall back to a more expensive parsing method.
// Every call must return a stream pointing at the beginning of
// the serialized RecvTensorResponse.
//
// Note that a subsequent call to contents() invalidates previous
// results of contents().
//
// Ownership of the returned stream is retained by the Source and
// should not be deleted by the caller.
virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
};
// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. // A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
class GrpcByteBufferSource class GrpcByteBufferSource
...@@ -46,6 +144,43 @@ class GrpcByteBufferSource ...@@ -46,6 +144,43 @@ class GrpcByteBufferSource
::google::protobuf::int64 byte_count_; ::google::protobuf::int64 byte_count_;
}; };
class GrpcByteBufferSourceWrapper : public Source {
public:
explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
: source_(source) {}
::google::protobuf::io::ZeroCopyInputStream* contents() override {
return source_;
}
private:
GrpcByteBufferSource* source_;
};
class GrpcByteSource : public Source {
public:
explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
~GrpcByteSource() override { DeleteStream(); }
typedef ::grpc::GrpcBufferReader Reader;
::google::protobuf::io::ZeroCopyInputStream* contents() override {
DeleteStream();
stream_ = new (&space_) Reader(buffer_);
return stream_;
}
private:
void DeleteStream() {
if (stream_) {
stream_->~Reader();
}
}
grpc_byte_buffer* buffer_; // Not owned
Reader* stream_ = nullptr; // Points into space_ if non-nullptr
char space_[sizeof(Reader)];
};
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "grpc_client.h" #include "grpc_client.h"
#include <sys/time.h>
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
...@@ -31,8 +33,9 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -31,8 +33,9 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
auto* var = p_scope->FindVar(var_name_val); auto* var = p_scope->FindVar(var_name_val);
sendrecv::VariableMessage req;
SerializeToMessage(var_name_val, var, *p_ctx, &req); ::grpc::ByteBuffer req;
SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
// varhandle // varhandle
VarHandle var_h; VarHandle var_h;
...@@ -46,8 +49,10 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -46,8 +49,10 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = NULL; s->response_call_back_ = NULL;
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto call = s->stub_g_.PrepareUnaryCall(
rpc->Finish(&s->reply_, &s->status_, (void*)s); s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
req_count_++; req_count_++;
...@@ -56,9 +61,19 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -56,9 +61,19 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
} }
void ProcGetResponse(const VarHandle& var_h, void ProcGetResponse(const VarHandle& var_h,
const sendrecv::VariableMessage& ret_msg) { // const sendrecv::VariableMessage& ret_msg) {
auto* outvar = var_h.scope->FindVar(var_h.name); const ::grpc::ByteBuffer& ret_msg) {
DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); framework::Variable* outvar = NULL;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, outvar);
}
template <typename T>
void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
::grpc::Slice slice(proto.ByteSizeLong());
proto.SerializeWithCachedSizesToArray(
const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(slice.begin())));
::grpc::ByteBuffer tmp(&slice, 1);
result->Swap(&tmp);
} }
bool RPCClient::AsyncGetVariable(const std::string& ep, bool RPCClient::AsyncGetVariable(const std::string& ep,
...@@ -88,8 +103,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -88,8 +103,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); ::grpc::ByteBuffer buf;
rpc->Finish(&s->reply_, &s->status_, (void*)s); RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
req_count_++; req_count_++;
......
...@@ -25,6 +25,11 @@ limitations under the License. */ ...@@ -25,6 +25,11 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include <grpc++/generic/generic_stub.h>
#include <grpc++/grpc++.h>
#include <grpc++/support/byte_buffer.h>
#include <grpc++/support/slice.h>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -49,15 +54,11 @@ struct VarHandle { ...@@ -49,15 +54,11 @@ struct VarHandle {
} }
}; };
void ProcGetResponse(const VarHandle& var_h, void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
const sendrecv::VariableMessage& msg);
class BaseProcessor { class BaseProcessor {
public: public:
explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) { explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) { context_ = NULL; }
stub_ = sendrecv::SendRecvService::NewStub(ch);
context_ = NULL;
}
virtual ~BaseProcessor() {} virtual ~BaseProcessor() {}
...@@ -82,19 +83,18 @@ class BaseProcessor { ...@@ -82,19 +83,18 @@ class BaseProcessor {
virtual void Process() = 0; virtual void Process() = 0;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
std::unique_ptr<grpc::ClientContext> context_; std::unique_ptr<grpc::ClientContext> context_;
grpc::Status status_; grpc::Status status_;
VarHandle var_h_; VarHandle var_h_;
}; };
typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)> typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
RequestSendCallBack; RequestSendCallBack;
class SendProcessor : public BaseProcessor { class SendProcessor : public BaseProcessor {
public: public:
explicit SendProcessor(std::shared_ptr<grpc::Channel> ch) explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {} : BaseProcessor(ch), stub_g_(ch) {}
virtual ~SendProcessor() {} virtual ~SendProcessor() {}
...@@ -104,17 +104,18 @@ class SendProcessor : public BaseProcessor { ...@@ -104,17 +104,18 @@ class SendProcessor : public BaseProcessor {
} }
} }
sendrecv::VoidMessage reply_; ::grpc::GenericStub stub_g_;
::grpc::ByteBuffer reply_;
RequestSendCallBack response_call_back_ = NULL; RequestSendCallBack response_call_back_ = NULL;
}; };
typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)> typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
RequestGetCallBack; RequestGetCallBack;
class GetProcessor : public BaseProcessor { class GetProcessor : public BaseProcessor {
public: public:
explicit GetProcessor(std::shared_ptr<grpc::Channel> ch) explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {} : BaseProcessor(ch), stub_g_(ch) {}
virtual ~GetProcessor() {} virtual ~GetProcessor() {}
...@@ -124,30 +125,37 @@ class GetProcessor : public BaseProcessor { ...@@ -124,30 +125,37 @@ class GetProcessor : public BaseProcessor {
} }
} }
sendrecv::VariableMessage reply_; ::grpc::ByteBuffer reply_;
::grpc::GenericStub stub_g_;
RequestGetCallBack response_call_back_ = ProcGetResponse; RequestGetCallBack response_call_back_ = ProcGetResponse;
}; };
class BatchBarrierProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor {
public: public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch) explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {} : BaseProcessor(ch) {
stub_ = sendrecv::SendRecvService::NewStub(ch);
}
virtual ~BatchBarrierProcessor() {} virtual ~BatchBarrierProcessor() {}
virtual void Process() {} virtual void Process() {}
sendrecv::VoidMessage reply_; sendrecv::VoidMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
}; };
class FetchBarrierProcessor : public BaseProcessor { class FetchBarrierProcessor : public BaseProcessor {
public: public:
explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch) explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {} : BaseProcessor(ch) {
stub_ = sendrecv::SendRecvService::NewStub(ch);
}
virtual ~FetchBarrierProcessor() {} virtual ~FetchBarrierProcessor() {}
virtual void Process() {} virtual void Process() {}
sendrecv::VariableMessage reply_; sendrecv::VariableMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
}; };
class RPCClient { class RPCClient {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/detail/grpc_server.h"
using grpc::ServerAsyncResponseWriter; using ::grpc::ServerAsyncResponseWriter;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -26,9 +26,10 @@ enum CallStatus { PROCESS = 0, FINISH }; ...@@ -26,9 +26,10 @@ enum CallStatus { PROCESS = 0, FINISH };
// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server // https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
class RequestBase { class RequestBase {
public: public:
explicit RequestBase(sendrecv::SendRecvService::AsyncService* service, explicit RequestBase(GrpcService::AsyncService* service,
grpc::ServerCompletionQueue* cq) ::grpc::ServerCompletionQueue* cq,
: service_(service), cq_(cq), status_(PROCESS) { const platform::DeviceContext* dev_ctx)
: service_(service), cq_(cq), status_(PROCESS), dev_ctx_(dev_ctx) {
PADDLE_ENFORCE(cq_); PADDLE_ENFORCE(cq_);
} }
virtual ~RequestBase() {} virtual ~RequestBase() {}
...@@ -42,55 +43,58 @@ class RequestBase { ...@@ -42,55 +43,58 @@ class RequestBase {
} }
protected: protected:
grpc::ServerContext ctx_; ::grpc::ServerContext ctx_;
sendrecv::SendRecvService::AsyncService* service_; GrpcService::AsyncService* service_;
grpc::ServerCompletionQueue* cq_; ::grpc::ServerCompletionQueue* cq_;
CallStatus status_; CallStatus status_;
const platform::DeviceContext* dev_ctx_;
}; };
typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
class RequestSend final : public RequestBase { class RequestSend final : public RequestBase {
public: public:
explicit RequestSend(sendrecv::SendRecvService::AsyncService* service, explicit RequestSend(GrpcService::AsyncService* service,
grpc::ServerCompletionQueue* cq, ::grpc::ServerCompletionQueue* cq,
SimpleBlockQueue<MessageWithName>* queue) framework::Scope* scope, ReceivedQueue* queue,
: RequestBase(service, cq), queue_(queue), responder_(&ctx_) { const platform::DeviceContext* dev_ctx)
service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_, : RequestBase(service, cq, dev_ctx), queue_(queue), responder_(&ctx_) {
this); request_.reset(new VariableResponse(scope, dev_ctx_));
int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
cq_, cq_, this);
} }
virtual ~RequestSend() {} virtual ~RequestSend() {}
virtual std::string GetReqName() { return request_.varname(); } virtual std::string GetReqName() { return request_->Varname(); }
virtual void Process() { virtual void Process() {
MessageWithName msg_with_name = queue_->Push(std::make_pair(request_->Varname(), request_));
std::make_pair(request_.varname(), std::move(request_));
queue_->Push(std::move(msg_with_name)); sendrecv::VoidMessage reply;
responder_.Finish(reply_, grpc::Status::OK, this); responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH; status_ = FINISH;
} }
protected: protected:
sendrecv::VariableMessage request_; std::shared_ptr<VariableResponse> request_;
sendrecv::VoidMessage reply_; ReceivedQueue* queue_;
SimpleBlockQueue<MessageWithName>* queue_;
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_; ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
}; };
class RequestGet final : public RequestBase { class RequestGet final : public RequestBase {
public: public:
explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, explicit RequestGet(GrpcService::AsyncService* service,
grpc::ServerCompletionQueue* cq, framework::Scope* scope, ::grpc::ServerCompletionQueue* cq,
framework::Scope* scope,
const platform::DeviceContext* dev_ctx, const platform::DeviceContext* dev_ctx,
SimpleBlockQueue<MessageWithName>* queue) SimpleBlockQueue<MessageWithName>* queue)
: RequestBase(service, cq), : RequestBase(service, cq, dev_ctx),
responder_(&ctx_), responder_(&ctx_),
scope_(scope), scope_(scope),
dev_ctx_(dev_ctx),
queue_(queue) { queue_(queue) {
service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this); int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
cq_, this);
} }
virtual ~RequestGet() {} virtual ~RequestGet() {}
...@@ -101,24 +105,26 @@ class RequestGet final : public RequestBase { ...@@ -101,24 +105,26 @@ class RequestGet final : public RequestBase {
// proc request. // proc request.
std::string var_name = request_.varname(); std::string var_name = request_.varname();
auto* var = scope_->FindVar(var_name); auto* var = scope_->FindVar(var_name);
::grpc::ByteBuffer reply;
if (var_name != FETCH_BARRIER_MESSAGE) { if (var_name != FETCH_BARRIER_MESSAGE) {
SerializeToMessage(var_name, var, *dev_ctx_, &reply_); SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
} }
// TODO(gongwb): check var's info.
responder_.Finish(reply_, grpc::Status::OK, this); responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH; status_ = FINISH;
MessageWithName msg_with_name =
// request name reply if (var_name == FETCH_BARRIER_MESSAGE) {
std::make_pair(var_name, std::move(reply_)); sendrecv::VariableMessage msg;
queue_->Push(msg_with_name); MessageWithName msg_with_name = std::make_pair(var_name, msg);
queue_->Push(msg_with_name);
}
} }
protected: protected:
sendrecv::VariableMessage request_; sendrecv::VariableMessage request_;
sendrecv::VariableMessage reply_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
framework::Scope* scope_; framework::Scope* scope_;
const platform::DeviceContext* dev_ctx_;
SimpleBlockQueue<MessageWithName>* queue_; SimpleBlockQueue<MessageWithName>* queue_;
}; };
...@@ -133,8 +139,8 @@ void AsyncGRPCServer::WaitClientGet(int count) { ...@@ -133,8 +139,8 @@ void AsyncGRPCServer::WaitClientGet(int count) {
} }
void AsyncGRPCServer::RunSyncUpdate() { void AsyncGRPCServer::RunSyncUpdate() {
grpc::ServerBuilder builder; ::grpc::ServerBuilder builder;
builder.AddListeningPort(address_, grpc::InsecureServerCredentials()); builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials());
builder.SetMaxSendMessageSize(std::numeric_limits<int>::max()); builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max()); builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
builder.RegisterService(&service_); builder.RegisterService(&service_);
...@@ -182,8 +188,8 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() { ...@@ -182,8 +188,8 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
if (is_shut_down_) { if (is_shut_down_) {
return; return;
} }
RequestSend* send = RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_,
new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); &var_recv_queue_, dev_ctx_);
VLOG(4) << "Create RequestSend status:" << send->Status(); VLOG(4) << "Create RequestSend status:" << send->Status();
} }
...@@ -198,7 +204,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { ...@@ -198,7 +204,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
} }
// FIXME(typhoonzero): change cq_name to enum. // FIXME(typhoonzero): change cq_name to enum.
void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq, void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
std::string cq_name, std::string cq_name,
std::function<void()> TryToRegisterNewOne) { std::function<void()> TryToRegisterNewOne) {
TryToRegisterNewOne(); TryToRegisterNewOne();
......
...@@ -14,28 +14,31 @@ limitations under the License. */ ...@@ -14,28 +14,31 @@ limitations under the License. */
#pragma once #pragma once
#include <grpc++/grpc++.h>
#include <thread>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h" #include "paddle/fluid/operators/detail/grpc_service.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h"
#include <grpc++/grpc++.h>
#include <grpc/support/log.h>
#include <thread>
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
typedef std::pair<std::string, std::shared_ptr<VariableResponse>>
ReceivedMessage;
typedef SimpleBlockQueue<ReceivedMessage> ReceivedQueue;
typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName; typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
class RequestBase; class RequestBase;
class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { class AsyncGRPCServer final {
public: public:
explicit AsyncGRPCServer(const std::string &address) : address_(address) {} explicit AsyncGRPCServer(const std::string &address) : address_(address) {}
...@@ -50,14 +53,16 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { ...@@ -50,14 +53,16 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
const MessageWithName Get() { return this->var_recv_queue_.Pop(); } const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } void Push(const std::string &msg_name) {
this->var_recv_queue_.Push(std::make_pair(msg_name, nullptr));
}
void ShutDown(); void ShutDown();
protected: protected:
void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name, void HandleRequest(::grpc::ServerCompletionQueue *cq, std::string cq_name,
std::function<void()> TryToRegisterNewOne); std::function<void()> TryToRegisterNewOne);
void TryToRegisterNewSendOne(); void TryToRegisterNewSendOne();
void TryToRegisterNewGetOne(); void TryToRegisterNewGetOne();
...@@ -66,18 +71,19 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { ...@@ -66,18 +71,19 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
private: private:
std::mutex cq_mutex_; std::mutex cq_mutex_;
volatile bool is_shut_down_ = false; volatile bool is_shut_down_ = false;
std::unique_ptr<grpc::ServerCompletionQueue> cq_send_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
std::unique_ptr<grpc::ServerCompletionQueue> cq_get_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
sendrecv::SendRecvService::AsyncService service_; GrpcService::AsyncService service_;
std::unique_ptr<grpc::Server> server_; std::unique_ptr<::grpc::Server> server_;
std::string address_; std::string address_;
framework::Scope *scope_; framework::Scope *scope_;
const platform::DeviceContext *dev_ctx_; const platform::DeviceContext *dev_ctx_;
// received variable from RPC, operators fetch variable from this queue. // received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue<MessageWithName> var_recv_queue_;
SimpleBlockQueue<MessageWithName> var_get_queue_; SimpleBlockQueue<MessageWithName> var_get_queue_;
ReceivedQueue var_recv_queue_;
// condition of the sub program // condition of the sub program
std::mutex barrier_mutex_; std::mutex barrier_mutex_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <grpc++/impl/codegen/async_stream.h>
#include <grpc++/impl/codegen/async_unary_call.h>
#include <grpc++/impl/codegen/proto_utils.h>
#include <grpc++/impl/codegen/rpc_method.h>
#include <grpc++/impl/codegen/service_type.h>
#include <grpc++/impl/codegen/status.h>
#include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/detail/variable_response.h"
// NOTE: This method was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// method and did some modifications so that we can parse gRPC
// requests without too much copying of the tensor data.
namespace grpc {
class CompletionQueue;
class Channel;
class RpcService;
class ServerCompletionQueue;
class ServerContext;
// Support parsing/unparsing of tensorflow::VariableResponse.
// Wire-format is identical to RecvVariableResponse.
template <>
class SerializationTraits<paddle::operators::detail::VariableResponse> {
public:
static Status Serialize(
const paddle::operators::detail::VariableResponse& msg,
grpc_byte_buffer** bp, bool* own_buffer) {
PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
return Status();
}
static Status Deserialize(grpc_byte_buffer* buffer,
paddle::operators::detail::VariableResponse* msg,
int max_message_size = INT_MAX) {
if (buffer == nullptr) {
return Status(StatusCode::INTERNAL, "No payload");
}
Status result = g_core_codegen_interface->ok();
if (result.ok()) {
paddle::operators::detail::GrpcByteSource source(buffer);
int ret = msg->Parse(&source);
if (ret != 0) {
result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
}
}
g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
return result;
}
};
} // namespace grpc
namespace paddle {
namespace operators {
namespace detail {
enum class GrpcMethod {
kSendVariable,
kGetVariable,
};
static const int kGrpcNumMethods =
static_cast<int>(GrpcMethod::kGetVariable) + 1;
inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) {
case GrpcMethod::kSendVariable:
return "/sendrecv.SendRecvService/SendVariable";
case GrpcMethod::kGetVariable:
return "/sendrecv.SendRecvService/GetVariable";
}
// Shouldn't be reached.
PADDLE_ENFORCE(false, "Invalid id: not found valid method name");
return nullptr;
}
class GrpcService final {
public:
class AsyncService : public ::grpc::Service {
public:
AsyncService() {
for (int i = 0; i < kGrpcNumMethods; ++i) {
AddMethod(new ::grpc::internal::RpcServiceMethod(
GrpcMethodName(static_cast<GrpcMethod>(i)),
::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
::grpc::Service::MarkMethodAsync(i);
}
}
virtual ~AsyncService() {}
// Make RequestAsyncUnary public for grpc_call.h
using ::grpc::Service::RequestAsyncUnary;
};
};
} // namespace detail
} // namespace operator
} // namespace paddle
...@@ -32,6 +32,9 @@ enum VarType { ...@@ -32,6 +32,9 @@ enum VarType {
SELECTED_ROWS = 1; SELECTED_ROWS = 1;
} }
// NOTICE(gongwb):don't modify this proto if you are not
// not familar with how we serialize in sendrecvop_utils.h
// and deserilize it in variable_response.h.
message VariableMessage { message VariableMessage {
enum Type { enum Type {
// Pod Types // Pod Types
...@@ -45,7 +48,6 @@ message VariableMessage { ...@@ -45,7 +48,6 @@ message VariableMessage {
} }
message LodData { repeated int64 lod_data = 1; } message LodData { repeated int64 lod_data = 1; }
string varname = 1; string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2; VarType type = 2;
...@@ -64,3 +66,5 @@ message VariableMessage { ...@@ -64,3 +66,5 @@ message VariableMessage {
} }
message VoidMessage {} message VoidMessage {}
message TestMessage { int64 test_1 = 1; }
...@@ -13,61 +13,19 @@ See the License for the specific language governing permissions and ...@@ -13,61 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include <sys/time.h>
#include <thread>
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h" #include "paddle/fluid/operators/detail/proto_encoder_helper.h"
#include "paddle/fluid/operators/detail/variable_response.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg) {
msg->set_varname(name);
std::ostringstream oss;
switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR:
msg->set_type(sendrecv::VarType::LOD_TENSOR);
framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
break;
case framework::proto::VarType_Type_SELECTED_ROWS:
msg->set_type(sendrecv::VarType::SELECTED_ROWS);
framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
ctx);
break;
default: {
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
break;
}
}
msg->set_serialized(oss.str());
}
void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx,
framework::Variable* var) {
std::istringstream iss(msg.serialized());
switch (msg.type()) {
case sendrecv::VarType::LOD_TENSOR:
DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
break;
case sendrecv::VarType::SELECTED_ROWS: {
DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
ctx);
break;
}
default: {
PADDLE_THROW("Deserialize does not support type: %s",
typeid(var->Type()).name());
break;
}
}
}
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) { ::grpc::ByteBuffer* msg) {
...@@ -123,6 +81,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -123,6 +81,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
static_cast<const platform::CUDADeviceContext&>(ctx); static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.memory_size(); auto copy_size = tensor.memory_size();
payload = memory::Alloc(cpu, copy_size); payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload, memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor.place()), boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()), reinterpret_cast<const void*>(tensor.data<void>()),
...@@ -132,6 +91,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -132,6 +91,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
platform::CPUPlace cpu; platform::CPUPlace cpu;
memory::Free(cpu, backing); memory::Free(cpu, backing);
}; };
#endif #endif
} else { } else {
payload = tensor.data<void>(); payload = tensor.data<void>();
...@@ -219,80 +179,11 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -219,80 +179,11 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
framework::Variable* var) { const framework::Scope* scope,
sendrecv::VariableMessage meta; framework::Variable*& var) {
GrpcByteBufferSource source; operators::detail::VariableResponse resp(scope, &ctx);
source.Init(msg); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
::google::protobuf::io::CodedInputStream input(&source); var = resp.GetVar();
// do zerocopy parsing
PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
PADDLE_ENFORCE(input.ConsumedEntireMessage());
// dims is needed by both tensor and selectedrows
std::vector<int> vecdims;
for (auto& d : meta.dims()) {
vecdims.push_back(d);
}
framework::DDim dims = framework::make_ddim(vecdims);
if (meta.type() == sendrecv::LOD_TENSOR) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
framework::LoD lod;
for (int i = 0; i < meta.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
v.push_back(meta.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
// How to avoid copying and use the message buffer directly?
// Maybe need to find a way to release all memory except tensor content.
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
} else if (meta.type() == sendrecv::SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
int64_t* rows_data = slr->mutable_rows()->data();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
// copy rows CPU data, GPU data will be copied lazly
memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
meta.rows().size());
}
} }
} // namespace detail } // namespace detail
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
...@@ -36,21 +37,14 @@ namespace detail { ...@@ -36,21 +37,14 @@ namespace detail {
typedef void (*DestroyCallback)(void*); typedef void (*DestroyCallback)(void*);
void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg);
void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg); ::grpc::ByteBuffer* msg);
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
framework::Variable* var); const framework::Scope* scope,
framework::Variable*& var);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) { switch (type) {
......
...@@ -16,11 +16,13 @@ limitations under the License. */ ...@@ -16,11 +16,13 @@ limitations under the License. */
#include <string> #include <string>
#include <thread> #include <thread>
#include <google/protobuf/text_format.h>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -31,19 +33,21 @@ namespace operators = paddle::operators; ...@@ -31,19 +33,21 @@ namespace operators = paddle::operators;
namespace math = paddle::operators::math; namespace math = paddle::operators::math;
namespace memory = paddle::memory; namespace memory = paddle::memory;
void RunSerdeTestTensor(platform::Place place) { void RunSerdeTestSelectedRows(platform::Place place) {
// serialize var to ByteBuffer
framework::Variable var;
auto* tensor = var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
framework::LoD lod;
lod.push_back(framework::Vector<size_t>({1, 3, 8}));
tensor->set_lod(lod);
int tensor_numel = 4 * 8 * 4 * 2;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
// serialize var to ByteBuffer
framework::Variable var;
auto* slr = var.GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({2, 10}));
tensor->mutable_data<float>(place); tensor->mutable_data<float>(place);
math::set_constant(ctx, tensor, 31.9); int tensor_numel = 2 * 10;
math::set_constant(ctx, tensor, 32.7);
rows->push_back(3);
rows->push_back(10);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
...@@ -56,62 +60,67 @@ void RunSerdeTestTensor(platform::Place place) { ...@@ -56,62 +60,67 @@ void RunSerdeTestTensor(platform::Place place) {
for (const auto& s : slices) { for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size()); tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
} }
sendrecv::VariableMessage varmsg; sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp)); EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar"); EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 0); EXPECT_EQ(varmsg.type(), 1);
EXPECT_EQ(varmsg.dims()[0], 4);
EXPECT_EQ(varmsg.dims()[1], 8);
EXPECT_EQ(varmsg.dims()[2], 4);
EXPECT_EQ(varmsg.dims()[3], 2);
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
const float* tensor_data = const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data()); reinterpret_cast<const float*>(varmsg.serialized().data());
const int64_t* rows_data =
reinterpret_cast<const int64_t*>(varmsg.rows().data());
for (int i = 0; i < tensor_numel; ++i) { for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 31.9); EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
} }
EXPECT_EQ(rows_data[0], 3);
EXPECT_EQ(rows_data[1], 10);
// deserialize zero-copy // deserialize zero-copy
framework::Variable var2; // framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto tensor2 = var2.Get<framework::LoDTensor>(); framework::Scope scope;
scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar();
auto* slr2 = var2->GetMutable<framework::SelectedRows>();
auto* tensor2 = slr2->mutable_value();
auto* rows2 = slr2->mutable_rows();
float* tensor_data2 = nullptr; float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor; framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu; platform::CPUPlace cpu;
framework::TensorCopy(tensor2, cpu, &tmp_tensor); framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>(); tensor_data2 = tmp_tensor.data<float>();
} else { } else {
tensor_data2 = const_cast<float*>(tensor2.data<float>()); tensor_data2 = const_cast<float*>(tensor2->data<float>());
} }
const int64_t* rows_data2 = rows2->data();
EXPECT_EQ(varmsg.lod_level(), 1); for (int i = 0; i < tensor_numel; ++i) {
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); }
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); EXPECT_EQ(rows_data2[0], 3);
for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); EXPECT_EQ(rows_data2[1], 10);
} }
void RunSerdeTestSelectedRows(platform::Place place) { void RunTestLodTensor(platform::Place place, int from_type = 0) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// serialize var to ByteBuffer // serialize var to ByteBuffer
framework::Variable var; framework::Variable var;
auto* slr = var.GetMutable<framework::SelectedRows>(); auto* tensor = var.GetMutable<framework::LoDTensor>();
auto* tensor = slr->mutable_value(); tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
auto* rows = slr->mutable_rows(); framework::LoD lod;
tensor->Resize(framework::make_ddim({2, 10})); lod.push_back(framework::Vector<size_t>({1, 3, 8}));
tensor->set_lod(lod);
int tensor_numel = 4 * 8 * 4 * 2;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
tensor->mutable_data<float>(place); tensor->mutable_data<float>(place);
int tensor_numel = 2 * 10; math::set_constant(ctx, tensor, 31.9);
math::set_constant(ctx, tensor, 32.7);
rows->push_back(3);
rows->push_back(10);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
...@@ -126,61 +135,82 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -126,61 +135,82 @@ void RunSerdeTestSelectedRows(platform::Place place) {
} }
sendrecv::VariableMessage varmsg; sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp)); EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar"); EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 1); EXPECT_EQ(varmsg.type(), 0);
EXPECT_EQ(varmsg.dims()[0], 4);
EXPECT_EQ(varmsg.dims()[1], 8);
EXPECT_EQ(varmsg.dims()[2], 4);
EXPECT_EQ(varmsg.dims()[3], 2);
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
const float* tensor_data = const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data()); reinterpret_cast<const float*>(varmsg.serialized().data());
const int64_t* rows_data =
reinterpret_cast<const int64_t*>(varmsg.rows().data());
for (int i = 0; i < tensor_numel; ++i) { for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 32.7); EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
} }
EXPECT_EQ(rows_data[0], 3);
EXPECT_EQ(rows_data[1], 10); // message binary
std::string str;
varmsg.SerializeToString(&str);
// message bytebuffer
::grpc::Slice slices_2[1];
int num_slices = 1;
slices_2[0] = ::grpc::Slice(str.length());
memcpy(const_cast<uint8_t*>(slices_2[0].begin()), str.c_str(), str.length());
::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices);
// deserialize zero-copy // deserialize zero-copy
framework::Variable var2; framework::Scope scope;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx);
if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0);
} else {
EXPECT_EQ(resp.Parse(bytebuffer2), 0);
}
auto* slr2 = var2.GetMutable<framework::SelectedRows>(); framework::Variable* var2 = resp.GetVar();
auto* tensor2 = slr2->mutable_value();
auto* rows2 = slr2->mutable_rows(); auto tensor2 = var2->Get<framework::LoDTensor>();
float* tensor_data2 = nullptr; float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor; framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu; platform::CPUPlace cpu;
framework::TensorCopy(*tensor2, cpu, &tmp_tensor); framework::TensorCopy(tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>(); tensor_data2 = tmp_tensor.data<float>();
} else { } else {
tensor_data2 = const_cast<float*>(tensor2->data<float>()); tensor_data2 = const_cast<float*>(tensor2.data<float>());
} }
const int64_t* rows_data2 = rows2->data();
for (int i = 0; i < tensor_numel; ++i) { EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
} EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(rows_data2[0], 3); EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
EXPECT_EQ(rows_data2[1], 10); for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
} }
TEST(SelectedRows, CPU) { TEST(LodTensor, Run) {
platform::CPUPlace place; platform::CPUPlace place;
RunSerdeTestSelectedRows(place); RunTestLodTensor(place);
RunTestLodTensor(place, 1);
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu(0);
RunTestLodTensor(gpu);
RunTestLodTensor(gpu, 1);
#endif
} }
TEST(SelectedRows, GPU) { TEST(SelectedRows, Run) {
platform::CUDAPlace place; platform::CPUPlace place;
RunSerdeTestSelectedRows(place); RunSerdeTestSelectedRows(place);
}
TEST(Tensor, CPU) { #ifdef PADDLE_WITH_CUDA
platform::CPUPlace place; platform::CUDAPlace gpu;
RunSerdeTestTensor(place); RunSerdeTestSelectedRows(gpu);
#endif
} }
TEST(Tensor, GPU) {
platform::CUDAPlace place;
RunSerdeTestTensor(place);
}
\ No newline at end of file
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/variable_response.h"
#include <string.h>
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
namespace paddle {
namespace operators {
namespace detail {
enum WireType {
WIRETYPE_VARINT = 0,
WIRETYPE_LENGTH_DELIMITED = 2,
};
inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
inline WireType GetTagWireType(uint32_t tag) {
return static_cast<WireType>(tag & 0x7);
}
bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
int* result) {
uint64_t v;
if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
*result = static_cast<int>(v);
return true;
} else {
return false;
}
}
bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& dev_ctx, platform::Place place,
void* dest, int size) {
const void* data = NULL;
int size_to_write = 0;
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
char* p = reinterpret_cast<char*>(dest);
while (size > 0) {
if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
return false;
}
memory::Copy(boost::get<platform::CUDAPlace>(place),
reinterpret_cast<void*>(p), cpu, data, size_to_write,
gpu_dev_ctx.stream());
p += size_to_write;
size -= size_to_write;
input->Skip(size_to_write);
}
gpu_dev_ctx.Wait();
#else
PADDLE_THROW("Unexpected branch");
#endif
return true;
}
char* p = reinterpret_cast<char*>(dest);
while (size > 0) {
if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
return false;
}
// TODO(gongwb): can we avoid copy?
platform::CPUPlace cpu;
memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
p += size_to_write;
size -= size_to_write;
input->Skip(size_to_write);
}
return true;
}
bool VariableResponse::CopyLodTensorData(
::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
auto var = scope_->FindVar(meta_.varname());
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(dims);
framework::LoD lod;
for (int i = 0; i < meta_.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < meta_.lod(i).lod_data_size(); ++j) {
v.push_back(meta_.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
void* tensor_data =
tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false;
}
return true;
}
inline framework::DDim GetDims(
const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) {
std::vector<int> vecdims;
for (auto& d : dims) {
vecdims.push_back(d);
}
return framework::make_ddim(vecdims);
}
bool VariableResponse::CopySelectRowsTensorData(
::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
auto var = scope_->FindVar(meta_.varname());
auto* slr = var->GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false;
}
return true;
}
bool VariableResponse::CopySelectRowsData(
::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, int length) {
auto var = scope_->FindVar(meta_.varname());
auto* slr = var->GetMutable<framework::SelectedRows>();
int64_t* rows_data = slr->mutable_rows()->data();
// copy rows CPU data, GPU data will be copied lazily.
platform::CPUPlace cpu;
if (!ReadRaw(input, ctx, cpu, rows_data, length)) {
return false;
}
return true;
}
bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
std::vector<int64_t>* lod) {
while (true) {
auto p = input->ReadTagWithCutoff(127);
int tag = GetTagFieldNumber(p.first);
WireType wt = GetTagWireType(p.first);
if (!p.second) {
return (tag == 0);
}
switch (tag) {
case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
uint64_t v;
if (wt == WIRETYPE_VARINT) {
if (!input->ReadVarint64(&v)) {
return false;
}
lod->push_back(v);
break;
}
if (wt == WIRETYPE_LENGTH_DELIMITED) {
int length = 0;
if (!input->ReadVarintSizeAsInt(&length)) {
return tag;
}
for (int i = 0; i < length; i++) {
uint64_t v;
if (!input->ReadVarint64(&v)) {
return false;
}
lod->push_back(v);
}
break;
}
return false;
}
default: { return false; }
}
}
return true;
}
int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
GrpcByteBufferSource source;
source.Init(byte_buffer);
GrpcByteBufferSourceWrapper r(&source);
return Parse(&r);
}
int VariableResponse::Parse(Source* source) {
::google::protobuf::io::ZeroCopyInputStream* input_stream =
source->contents();
::google::protobuf::io::CodedInputStream input(input_stream);
input.SetTotalBytesLimit(INT_MAX, INT_MAX);
while (true) {
auto p = input.ReadTagWithCutoff(127);
int tag = GetTagFieldNumber(p.first);
WireType wt = GetTagWireType(p.first);
if (!p.second) {
if (tag != 0) {
return -1;
}
return 0;
}
switch (tag) {
case sendrecv::VariableMessage::kVarnameFieldNumber: {
uint32_t length;
if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
return tag;
}
std::string temp;
if (!input.ReadString(&temp, length)) {
return tag;
}
meta_.set_varname(temp);
break;
}
case sendrecv::VariableMessage::kTypeFieldNumber: {
uint64_t v;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
return tag;
}
meta_.set_type(static_cast<::sendrecv::VarType>(v));
break;
}
case sendrecv::VariableMessage::kDataTypeFieldNumber: {
uint64_t v = 0;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
return tag;
}
meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
break;
}
case sendrecv::VariableMessage::kDimsFieldNumber: {
// not packed
if (wt == WIRETYPE_VARINT) {
uint64_t v;
if (!input.ReadVarint64(&v)) {
return tag;
}
meta_.add_dims(v);
break;
}
// packed
if (wt == WIRETYPE_LENGTH_DELIMITED) {
int length = 0;
if (!input.ReadVarintSizeAsInt(&length)) {
return tag;
}
for (int i = 0; i < length; i++) {
uint64_t v;
if (!input.ReadVarint64(&v)) {
return tag;
}
meta_.add_dims(v);
}
break;
}
return tag;
}
case sendrecv::VariableMessage::kLodLevelFieldNumber: {
uint64_t v = 0;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
return tag;
}
meta_.set_lod_level(static_cast<int64_t>(v));
break;
}
case sendrecv::VariableMessage::kLodFieldNumber: {
int length = 0;
if (wt != WIRETYPE_LENGTH_DELIMITED ||
!ReadVarintSizeAsInt(&input, &length)) {
return tag;
}
std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
input.IncrementRecursionDepthAndPushLimit(length);
std::vector<int64_t> lod_data;
if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
return tag;
}
if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
return false;
}
if (lod_data.size() == 0) {
break;
}
auto lod = meta_.add_lod();
for (uint32_t i = 0; i < lod_data.size(); i++) {
lod->add_lod_data(lod_data[i]);
}
break;
}
case sendrecv::VariableMessage::kSerializedFieldNumber: {
PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
meta_.type() == sendrecv::LOD_TENSOR) &&
meta_.varname() != "",
"meta info should be got first!");
int length = 0;
if (wt != WIRETYPE_LENGTH_DELIMITED ||
!ReadVarintSizeAsInt(&input, &length)) {
return tag;
}
framework::DDim dims = GetDims(meta_.dims());
if (meta_.type() == sendrecv::LOD_TENSOR) {
PADDLE_ENFORCE(meta_.lod_size() >= 0,
"lod info should be got first!");
if (!CopyLodTensorData(&input, *dev_ctx_, dims, length)) {
return tag;
}
break;
}
if (meta_.type() == sendrecv::SELECTED_ROWS) {
if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, length)) {
return tag;
}
break;
}
return tag;
}
case sendrecv::VariableMessage::kRowsFieldNumber: {
PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
meta_.type() == sendrecv::LOD_TENSOR) &&
meta_.varname() != "",
"meta info should be got first!");
int length = 0;
if (wt != WIRETYPE_LENGTH_DELIMITED ||
!ReadVarintSizeAsInt(&input, &length)) {
return tag;
}
if (!CopySelectRowsData(&input, *dev_ctx_, length)) {
return tag;
}
break;
}
default: {
// Unknown tag, return unknown error.
return -1;
}
}
}
return 0;
}
}; // namespace detail
}; // namespace operators
}; // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
namespace paddle {
namespace operators {
namespace detail {
class VariableResponse {
public:
VariableResponse(const framework::Scope* scope,
const platform::DeviceContext* dev_ctx)
: scope_(scope), dev_ctx_(dev_ctx) {}
virtual ~VariableResponse() {}
// return:
// 0:ok.
// -1: unkown error.
// other: number of error field.
int Parse(Source* source);
// return:
// 0:ok.
// -1: unkown error.
// other: number of error field.
int Parse(const ::grpc::ByteBuffer& byte_buffer);
inline std::string Varname() { return meta_.varname(); }
// should call parse first.
framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
private:
bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx,
framework::DDim& dims, int length);
bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, int length);
bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx,
framework::DDim& dims, int length);
private:
const framework::Scope* scope_;
const platform::DeviceContext* dev_ctx_;
// only Skeleton
sendrecv::VariableMessage meta_;
};
}; // namespace detail
}; // namespace operators
}; // namespace paddle
...@@ -188,8 +188,8 @@ The general steps are as follows. First, calculate the true positive and ...@@ -188,8 +188,8 @@ The general steps are as follows. First, calculate the true positive and
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(detection_map, ops::DetectionMAPOp, REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker,
ops::DetectionMAPOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>, detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>); ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
...@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel { ...@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel {
} }
}; };
template <typename AttrType>
class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker) DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
...@@ -73,7 +72,6 @@ are set equal to their corresponding inputs. ...@@ -73,7 +72,6 @@ are set equal to their corresponding inputs.
} }
}; };
template <typename AttrType>
class DropoutOpGrad : public framework::OperatorWithKernel { class DropoutOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel { ...@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad, REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
ops::DropoutOpGrad<float>); ops::DropoutOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
dropout, dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
dropout_grad, dropout_grad,
ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>); ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
...@@ -18,20 +18,22 @@ limitations under the License. */ ...@@ -18,20 +18,22 @@ limitations under the License. */
#include <thrust/random.h> #include <thrust/random.h>
#include <thrust/transform.h> #include <thrust/transform.h>
#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T, typename AttrType> template <typename T>
__global__ void RandomGenerator(const size_t n, const int seed, __global__ void RandomGenerator(const size_t n, const int seed,
const AttrType dropout_prob, const T* src, const float dropout_prob, const T* src,
T* mask_data, T* dst) { T* mask_data, T* dst) {
thrust::minstd_rand rng; thrust::minstd_rand rng;
rng.seed(seed); rng.seed(seed);
thrust::uniform_real_distribution<AttrType> dist(0, 1); thrust::uniform_real_distribution<float> dist(0, 1);
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < n; idx += blockDim.x * gridDim.x) { for (; idx < n; idx += blockDim.x * gridDim.x) {
rng.discard(idx);
if (dist(rng) < dropout_prob) { if (dist(rng) < dropout_prob) {
mask_data[idx] = static_cast<T>(0); mask_data[idx] = static_cast<T>(0);
} else { } else {
...@@ -44,17 +46,14 @@ __global__ void RandomGenerator(const size_t n, const int seed, ...@@ -44,17 +46,14 @@ __global__ void RandomGenerator(const size_t n, const int seed,
// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to // Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random. // implement uniform random.
template <typename Place, typename T, typename AttrType> template <typename Place, typename T>
class GPUDropoutKernel : public framework::OpKernel<T> { class GPUDropoutKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X"); auto* x = context.Input<Tensor>("X");
auto* y = context.Output<Tensor>("Out"); auto* y = context.Output<Tensor>("Out");
y->mutable_data<T>(context.GetPlace()); y->mutable_data<T>(context.GetPlace());
AttrType dropout_prob = context.Attr<AttrType>("dropout_prob"); float dropout_prob = context.Attr<float>("dropout_prob");
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto& place = *context.template device_context<Place>().eigen_device(); auto& place = *context.template device_context<Place>().eigen_device();
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
...@@ -70,11 +69,13 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -70,11 +69,13 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
int threads = 512; int threads = 512;
int grid = (x->numel() + threads - 1) / threads; int grid = (x->numel() + threads - 1) / threads;
RandomGenerator<T, AttrType><<<grid, threads, 0, RandomGenerator<
context.cuda_device_context().stream()>>>( T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, seed, dropout_prob, x_data, mask_data, y_data); size, seed, dropout_prob, x_data, mask_data, y_data);
} else { } else {
Y.device(place) = X * (1.0f - dropout_prob); auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
} }
} }
}; };
...@@ -83,9 +84,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -83,9 +84,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
dropout, dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>); ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(dropout_grad,
dropout_grad, ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
...@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor, ...@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T, typename AttrType> template <typename DeviceContext, typename T>
class CPUDropoutKernel : public framework::OpKernel<T> { class CPUDropoutKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(dropout);
void Compare(f::Scope& scope, p::DeviceContext& ctx) {
// init
auto var = scope.Var("X");
auto tensor = var->GetMutable<f::LoDTensor>();
tensor->Resize({10, 10});
std::vector<float> init;
for (int64_t i = 0; i < 10 * 10; ++i) {
init.push_back(1.0);
}
TensorFromVector(init, ctx, tensor);
auto place = ctx.GetPlace();
auto out_var = scope.Var("Out");
auto out_tensor = out_var->GetMutable<f::LoDTensor>();
out_tensor->Resize({10, 10});
out_tensor->mutable_data<float>(place); // allocate
auto mask_var = scope.Var("Mask");
auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
mask_tensor->Resize({10, 10});
mask_tensor->mutable_data<float>(place); // allocate
// run
f::AttributeMap attrs;
float dropout_prob = 0.5;
attrs.insert({"fix_seed", 1});
attrs.insert({"seed", 3});
attrs.insert({"dropout_prob", dropout_prob});
auto dropout_op = f::OpRegistry::CreateOp(
"dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
dropout_op->Run(scope, place);
std::vector<float> out_vec;
TensorToVector(*out_tensor, ctx, &out_vec);
std::vector<float> std_out = {
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1};
EXPECT_EQ(out_vec.size(), std_out.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], std_out[i]);
}
}
TEST(Dropout, CPUDense) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
Compare(scope, ctx);
}
TEST(Dropout, GPUDense) {
f::Scope scope;
p::CUDAPlace place;
p::CUDADeviceContext ctx(place);
Compare(scope, ctx);
}
...@@ -14,19 +14,20 @@ limitations under the License. */ ...@@ -14,19 +14,20 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_add, elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_add_grad, elementwise_add_grad,
ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
int64_t>);
...@@ -87,8 +87,9 @@ $$ ...@@ -87,8 +87,9 @@ $$
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp, REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp,
ops::IOUSimilarityOpMaker); ops::IOUSimilarityOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
iou_similarity, iou_similarity,
......
...@@ -22,6 +22,103 @@ limitations under the License. */ ...@@ -22,6 +22,103 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Wrap RowwiseMean and ColwiseMean.
// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is
// significantly faster. Unlike the RowwiseMean and ColwiseMean, the
// implementation only considers 2D.
template <typename DeviceContext, typename T>
struct RowwiseMean2D {
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class RowwiseMean2D<platform::CUDADeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({right_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0 / right);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, false, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class RowwiseMean2D<platform::CPUDeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
row_mean_(context, input, out);
}
private:
math::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
};
template <typename DeviceContext, typename T>
struct ColwiseSum2D {
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class ColwiseSum2D<platform::CUDADeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({left_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, true, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class ColwiseSum2D<platform::CPUDeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
col_wise_(context, input, out);
}
private:
math::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
};
template <typename T> template <typename T>
struct SubAndSquareFunctor { struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout; ...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> { class LayerNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y"); auto* y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean"); auto* mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance"); auto* var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x.dims(); const auto x_dims = x.dims();
...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
out.ShareDataWith(*y); out.ShareDataWith(*y);
out.Resize(matrix_shape); out.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
// get mean // get mean
row_mean(dev_ctx, x, mean); row_mean(dev_ctx, x, mean);
...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> { class LayerNormGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
auto *mean = ctx.Input<Tensor>("Mean"); auto* mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance"); auto* var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y")); auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
// init output // init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X")); auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale")); auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias")); auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto &x_dims = x.dims(); const auto& x_dims = x.dims();
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]); int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]); int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right}); framework::DDim matrix_shape({left, right});
d_y.Resize(matrix_shape); d_y.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::ColwiseSum<DeviceContext, T> colwise_sum; ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
ctx.device_context());
Tensor temp; Tensor temp;
Tensor temp_norm; Tensor temp_norm;
...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> { ...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
Tensor temp_vec; Tensor temp_vec;
temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace()); temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right,
ctx.device_context());
if (d_scale) { if (d_scale) {
// dy_dx // dy_dx
......
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h" #include "paddle/fluid/operators/detail/simple_block_queue.h"
...@@ -68,9 +69,7 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -68,9 +69,7 @@ class ListenAndServOp : public framework::OperatorBase {
} }
void Stop() override { void Stop() override {
detail::MessageWithName term_msg; rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
term_msg.first = LISTEN_TERMINATE_MESSAGE;
rpc_service_->Push(term_msg);
rpc_service_->ShutDown(); rpc_service_->ShutDown();
server_thread_->join(); server_thread_->join();
} }
...@@ -89,6 +88,10 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -89,6 +88,10 @@ class ListenAndServOp : public framework::OperatorBase {
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock); auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program(); auto *program = block->Program();
int num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks");
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
// TODO(typhoonzero): change this to a while_op for every cluster-batch. // TODO(typhoonzero): change this to a while_op for every cluster-batch.
...@@ -103,7 +106,7 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -103,7 +106,7 @@ class ListenAndServOp : public framework::OperatorBase {
size_t recv_var_cnt = 0; size_t recv_var_cnt = 0;
int batch_barrier = 0; int batch_barrier = 0;
while (batch_barrier != fan_in) { while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get(); const detail::ReceivedMessage v = rpc_service_->Get();
auto recv_var_name = v.first; auto recv_var_name = v.first;
if (recv_var_name == LISTEN_TERMINATE_MESSAGE) { if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit"; LOG(INFO) << "received terminate message and exit";
...@@ -116,12 +119,11 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -116,12 +119,11 @@ class ListenAndServOp : public framework::OperatorBase {
} else { } else {
VLOG(3) << "received grad: " << recv_var_name; VLOG(3) << "received grad: " << recv_var_name;
recv_var_cnt++; recv_var_cnt++;
auto *var = recv_scope.FindVar(recv_var_name); auto var = v.second->GetVar();
if (var == nullptr) { if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << recv_var_name; LOG(ERROR) << "Can not find server side var: " << recv_var_name;
PADDLE_THROW("Can not find server side var"); PADDLE_THROW("Can not find server side var");
} }
detail::DeserializeFromMessage(v.second, dev_ctx, var);
if (var->IsType<framework::SelectedRows>()) { if (var->IsType<framework::SelectedRows>()) {
sparse_vars.push_back(var); sparse_vars.push_back(var);
} }
...@@ -132,12 +134,35 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -132,12 +134,35 @@ class ListenAndServOp : public framework::OperatorBase {
rpc_service_->ShutDown(); rpc_service_->ShutDown();
break; break;
} }
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ // put optimize blocks in the thread pool to start run, the last block
false /*create_local_scope*/, false /*create_vars*/); // should be global ops.
} catch (std::exception &e) { // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads
LOG(ERROR) << "run sub program error " << e.what(); // and this will still work.
std::vector<std::future<void>> fs;
// block0 contains only listen_and_serv op, start run from block1.
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
fs.push_back(
framework::Async([&executor, &program, &recv_scope, blkid]() {
int run_block = blkid; // thread local
try {
executor.Run(*program, &recv_scope, run_block, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}));
}
for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait();
// Run global block at final step, or block1 if there are only 2 blocks
if (num_blocks >= 2) {
try {
executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
} }
// Reset the received sparse variables, the sum operator would not // Reset the received sparse variables, the sum operator would not
// sum the input sparse variables which rows is empty at the next // sum the input sparse variables which rows is empty at the next
// mini-batch. // mini-batch.
...@@ -151,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -151,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase {
rpc_service_->WaitClientGet(fan_in); rpc_service_->WaitClientGet(fan_in);
sparse_vars.clear(); sparse_vars.clear();
} // while(true) } // while(true)
// for (int i = 0; i < num_blocks; ++i) {
// delete blk_ctx_list[i];
// }
} }
protected: protected:
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/lrn_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename... Args>
std::shared_ptr<T> insert_to_context(const std::string& key,
const MKLDNNDeviceContext& dev_ctx,
Args&&... args) {
auto p = std::static_pointer_cast<T, void>(dev_ctx.GetBlob(key));
if (!p) {
p = std::make_shared<T>(args...);
dev_ctx.SetBlob(key, std::static_pointer_cast<void, T>(p));
}
return p;
}
template <typename... Args>
void run_primitive(Args&&... args) {
auto forward_op = mkldnn::lrn_forward{args...};
std::vector<mkldnn::primitive> pipeline = {forward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // namespace
template <typename T>
class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x = ctx.Input<Tensor>("X");
auto out = ctx.Output<Tensor>("Out");
auto mid = ctx.Output<Tensor>("MidOut");
auto input_data = x->data<T>();
auto output_data = out->mutable_data<T>(ctx.GetPlace());
mid->mutable_data<T>(ctx.GetPlace());
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
const bool is_test = ctx.Attr<bool>("is_test");
auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
e_mid = e_mid.constant(k);
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels,
src_md,
n,
alpha,
beta,
k};
auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
static_cast<void*>(output_data)};
if (!is_test) {
const std::string key = ctx.op().Output("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
auto forward_pd = insert_to_context<mkldnn::lrn_forward::primitive_desc>(
key_pd, dev_ctx, forward_desc, mkldnn_engine);
auto src_memory = insert_to_context<mkldnn::memory>(
key_src_memory, dev_ctx, src_memory_pd);
src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data)));
auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc());
run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
} else {
auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
auto src_memory = mkldnn::memory{
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()};
run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
}
}
};
template <typename T>
class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
auto x = ctx.Input<Tensor>("X");
auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
const std::string key = ctx.op().Input("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
auto out_grad_data = out_grad->data<T>();
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_memory =
mkldnn::memory{{diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<float*>(out_grad_data))};
auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine},
static_cast<void*>(x_grad_data)};
auto backward_desc = mkldnn::lrn_backward::desc{
mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k};
auto forward_pd = dev_ctx.GetBlob(key_pd);
auto backward_pd = mkldnn::lrn_backward::primitive_desc{
backward_desc, mkldnn_engine,
*static_cast<mkldnn::lrn_forward::primitive_desc*>(forward_pd.get())};
std::shared_ptr<void> workspace_memory =
dev_ctx.GetBlob(key_workspace_memory);
auto src_memory = dev_ctx.GetBlob(key_src_memory);
auto backward_op = mkldnn::lrn_backward{
backward_pd, *static_cast<mkldnn::memory*>(src_memory.get()),
diff_dst_memory, *static_cast<mkldnn::memory*>(workspace_memory.get()),
diff_src_memory};
std::vector<mkldnn::primitive> pipeline = {backward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNGradOpKernel<float>);
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/lrn_op.h" #include "paddle/fluid/operators/lrn_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> { ...@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
template struct LRNGradFunctor<platform::CPUDeviceContext, float>; template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
template struct LRNGradFunctor<platform::CPUDeviceContext, double>; template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
namespace {
framework::OpKernelType GetExpectedLRNKernel(
const framework::ExecutionContext& ctx) {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_);
}
} // namespace
class LRNOp : public framework::OperatorWithKernel { class LRNOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -132,8 +155,13 @@ class LRNOp : public framework::OperatorWithKernel { ...@@ -132,8 +155,13 @@ class LRNOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4.");
ctx->SetOutputDim("Out", x_dim); ctx->SetOutputDim("Out", x_dim);
ctx->SetOutputDim("MidOut", x_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
ctx->SetOutputDim("MidOut", x_dim);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
} }
}; };
...@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"beta is the power number.") "beta is the power number.")
.SetDefault(0.75) .SetDefault(0.75)
.GreaterThan(0.0); .GreaterThan(0.0);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Local Response Normalization Operator. Local Response Normalization Operator.
...@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel { ...@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
} }
};
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -6,6 +6,7 @@ function(math_library TARGET) ...@@ -6,6 +6,7 @@ function(math_library TARGET)
# But it handle split GPU/CPU code and link some common library. # But it handle split GPU/CPU code and link some common library.
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
set(hip_srcs)
set(math_common_deps device_context framework_proto) set(math_common_deps device_context framework_proto)
set(multiValueArgs DEPS) set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
...@@ -17,10 +18,15 @@ function(math_library TARGET) ...@@ -17,10 +18,15 @@ function(math_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_srcs ${TARGET}.hip.cu)
endif()
list(LENGTH cc_srcs cc_srcs_len) list(LENGTH cc_srcs cc_srcs_len)
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif(${cc_srcs_len} GREATER 0) elseif(${cc_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
endif() endif()
...@@ -43,7 +49,7 @@ math_library(sequence2batch) ...@@ -43,7 +49,7 @@ math_library(sequence2batch)
math_library(sequence_padding) math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function) math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale) math_library(sequence_scale)
math_library(softmax) math_library(softmax DEPS math_function)
math_library(unpooling) math_library(unpooling)
math_library(vol2col) math_library(vol2col)
......
...@@ -20,7 +20,7 @@ namespace math { ...@@ -20,7 +20,7 @@ namespace math {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> { class ConcatFunctor<platform::CPUDeviceContext, T> {
...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> { ...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> { class ConcatGradFunctor<platform::CPUDeviceContext, T> {
......
...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, ...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
} }
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col, __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
const int output_rows, const int output_cols, const int out_rows, const int out_cols,
T* output) { T* output_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { int split = tid_x * 1.0 / fixed_in_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_in_col;
int in_offset = tid_x - split * input_col; T* input_ptr = inputs_data[split];
T* input_ptr = inputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] = output_data[tid_y * out_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset]; input_ptr[tid_y * fixed_in_col + in_offset];
} }
} }
} }
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int* output_cols, const int in_col, const int* out_cols,
int col_size, T** outputs) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1; int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
int curr_offset = output_cols[segment]; int curr_offset = out_cols[segment];
int curr_segment = segment; int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment]; T* output_ptr = outputs_data[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] = output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int output_cols, const int in_col, const int fixed_out_col,
T** outputs) { T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { int split = tid_x / fixed_out_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_out_col;
int in_offset = tid_x - split * input_col; T* output_ptr = outputs_data[split];
T* output_ptr = outputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] = output_ptr[tid_y * fixed_out_col + in_offset] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> { class ConcatFunctor<platform::CUDADeviceContext, T> {
...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
const std::vector<framework::Tensor>& input, const int axis, const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) { framework::Tensor* output) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = input.size(); int in_num = input.size();
int rows = 1; int in_row = 1;
auto dim_0 = input[0].dims(); auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
rows *= dim_0[i]; in_row *= dim_0[i];
} }
int cols = input[0].numel() / rows; int in_col = input[0].numel() / in_row;
int out_rows = rows, out_cols = 0; int out_row = in_row, out_col = 0;
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1); framework::Vector<int> inputs_col(in_num + 1);
inputs_cols[0] = 0;
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data()); T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
inputs_col[0] = 0;
bool sameShape = true; bool sameShape = true;
for (int i = 0; i < num; ++i) { for (int i = 0; i < in_num; ++i) {
int t_cols = input[i].numel() / rows; int t_cols = input[i].numel() / in_row;
if (sameShape) { if (sameShape) {
if (t_cols != cols) sameShape = false; if (t_cols != in_col) sameShape = false;
} }
out_cols += t_cols; out_col += t_cols;
inputs_cols[i + 1] = out_cols; inputs_col[i + 1] = out_col;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>()); inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
} }
T** ins_gpu = T** dev_ins_data =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation // computation
// set the thread block and grid according to CurrentDeviceId // set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5; block_cols = ((out_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks); std::min((out_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>()); dev_ins_data, in_col, out_row, out_col, output->data<T>());
} else { } else {
const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows, dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_cols, output->data<T>()); out_row, out_col, output->data<T>());
} }
} }
}; };
/* /*
* All tensors' dimension should be the same and the values of * All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const int axis, const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) { std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = outputs.size(); int o_num = outputs.size();
int input_row = 1; int out_row = 1;
auto dim_0 = outputs[0].dims(); auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i]; out_row *= dim_0[i];
} }
int output_col_0 = outputs[0].numel() / input_row; int out_col = outputs[0].numel() / out_row;
int input_col = 0; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1); framework::Vector<int> outputs_cols(o_num + 1);
outputs_cols[0] = 0;
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data()); T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) { outputs_cols[0] = 0;
int t_col = outputs[i].numel() / input_row; for (int i = 0; i < o_num; ++i) {
int t_col = outputs[i].numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != output_col_0) sameShape = false; if (t_col != out_col) sameShape = false;
} }
input_col += t_col; in_col += t_col;
outputs_cols[i + 1] = input_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs[i].data<T>(); outputs_ptr[i] = outputs[i].data<T>();
} }
T** outs_gpu = T** dev_out_gpu_data =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation // computation
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5; block_cols = ((in_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks); std::min((in_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu); input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu, input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), outs_gpu); static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
} }
} }
}; };
......
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <hip/hip_runtime.h>
...@@ -278,6 +278,7 @@ void axpy<platform::CPUDeviceContext, double>( ...@@ -278,6 +278,7 @@ void axpy<platform::CPUDeviceContext, double>(
cblas_daxpy(n, alpha, x, 1, y, 1); cblas_daxpy(n, alpha, x, 1, y, 1);
} }
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, float>; template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>; template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>; template struct SetConstant<platform::CPUDeviceContext, int>;
...@@ -371,6 +372,8 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>; ...@@ -371,6 +372,8 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>; template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>; template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
template struct RowwiseSum<platform::CPUDeviceContext, float>; template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>; template struct RowwiseSum<platform::CPUDeviceContext, double>;
......
...@@ -348,6 +348,7 @@ void axpy<platform::CUDADeviceContext, double>( ...@@ -348,6 +348,7 @@ void axpy<platform::CUDADeviceContext, double>(
&alpha, x, 1, y, 1)); &alpha, x, 1, y, 1));
} }
template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
template struct SetConstant<platform::CUDADeviceContext, float>; template struct SetConstant<platform::CUDADeviceContext, float>;
template struct SetConstant<platform::CUDADeviceContext, double>; template struct SetConstant<platform::CUDADeviceContext, double>;
template struct SetConstant<platform::CUDADeviceContext, int>; template struct SetConstant<platform::CUDADeviceContext, int>;
...@@ -422,6 +423,8 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> { ...@@ -422,6 +423,8 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
template struct RowwiseAdd<platform::CUDADeviceContext, float>; template struct RowwiseAdd<platform::CUDADeviceContext, float>;
template struct RowwiseAdd<platform::CUDADeviceContext, double>; template struct RowwiseAdd<platform::CUDADeviceContext, double>;
template struct ColwiseSum<platform::CUDADeviceContext, float>; template struct ColwiseSum<platform::CUDADeviceContext, float>;
template struct ColwiseSum<platform::CUDADeviceContext, int>;
template struct ColwiseSum<platform::CUDADeviceContext, int64_t>;
// template struct ColwiseSum<platform::CUDADeviceContext, double>; // template struct ColwiseSum<platform::CUDADeviceContext, double>;
// The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode, // The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
// and only failed for this case. So reimplemented it. // and only failed for this case. So reimplemented it.
......
...@@ -19,13 +19,6 @@ limitations under the License. */ ...@@ -19,13 +19,6 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_ATLAS
extern "C" {
#include <cblas.h>
#include <clapack.h>
}
#endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#include <lapacke.h> #include <lapacke.h>
......
...@@ -89,6 +89,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()( ...@@ -89,6 +89,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
XGrad->mutable_data<T>(context.GetPlace()))); XGrad->mutable_data<T>(context.GetPlace())));
} }
template class SoftmaxCUDNNFunctor<platform::float16>;
template class SoftmaxCUDNNFunctor<float>; template class SoftmaxCUDNNFunctor<float>;
template class SoftmaxCUDNNFunctor<double>; template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<float>; template class SoftmaxGradCUDNNFunctor<float>;
......
...@@ -324,8 +324,9 @@ MatchIndices elements with value -1. ...@@ -324,8 +324,9 @@ MatchIndices elements with value -1.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp, REGISTER_OPERATOR(mine_hard_examples, ops::MineHardExamplesOp,
ops::MineHardExamplesOpMaker); ops::MineHardExamplesOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
mine_hard_examples, mine_hard_examples,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename Functor>
class MKLDNNActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
"Cannot find output tensor Out, variable name = %s",
context.op().Output("Out"));
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class MKLDNNActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
namespace {
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper) {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
}
#endif
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace(), layout, library);
}
} // anonymous namespace
class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
} // namespace operators
} // namespace paddle
...@@ -168,7 +168,9 @@ https://arxiv.org/abs/1512.02325. ...@@ -168,7 +168,9 @@ https://arxiv.org/abs/1512.02325.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker); REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>, prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>); ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
...@@ -15,10 +15,12 @@ function(reader_library TARGET_NAME) ...@@ -15,10 +15,12 @@ function(reader_library TARGET_NAME)
PARENT_SCOPE) PARENT_SCOPE)
endfunction() endfunction()
reader_library(open_files_op SRCS open_files_op.cc)
reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc) reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
# Export local libraries to parent # Export local libraries to parent
set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
...@@ -48,20 +48,24 @@ class DoubleBufferReader : public framework::DecoratedReader { ...@@ -48,20 +48,24 @@ class DoubleBufferReader : public framework::DecoratedReader {
void start_thread() { void start_thread() {
buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize); buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize);
std::thread prefetch([this] { PrefetchThreadFunc(); }); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
prefetch.detach();
} }
void ReadNext(std::vector<framework::LoDTensor>* out) override; void ReadNext(std::vector<framework::LoDTensor>* out) override;
void ReInit() override; void ReInit() override;
~DoubleBufferReader() { buffer_->Close(); } ~DoubleBufferReader() {
buffer_->Close();
prefetcher_.join();
delete buffer_;
}
bool HasNext() const override; bool HasNext() const override;
private: private:
void PrefetchThreadFunc(); void PrefetchThreadFunc();
std::thread prefetcher_;
framework::Channel<Item>* buffer_; framework::Channel<Item>* buffer_;
platform::Place place_; platform::Place place_;
std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_; std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
...@@ -120,10 +124,13 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -120,10 +124,13 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
}; };
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) { void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
if (local_buffer_.payloads_.empty()) { if (local_buffer_.payloads_.empty()) {
buffer_->Receive(&local_buffer_); buffer_->Receive(&local_buffer_);
} }
*out = local_buffer_.payloads_; *out = local_buffer_.payloads_;
local_buffer_.payloads_.clear(); local_buffer_.payloads_.clear();
if (local_buffer_.ctx_) { if (local_buffer_.ctx_) {
...@@ -134,6 +141,8 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) { ...@@ -134,6 +141,8 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
void DoubleBufferReader::ReInit() { void DoubleBufferReader::ReInit() {
reader_->ReInit(); reader_->ReInit();
buffer_->Close(); buffer_->Close();
prefetcher_.join();
delete buffer_;
start_thread(); start_thread();
} }
...@@ -157,13 +166,16 @@ void DoubleBufferReader::PrefetchThreadFunc() { ...@@ -157,13 +166,16 @@ void DoubleBufferReader::PrefetchThreadFunc() {
std::swap(gpu_batch, batch.payloads_); std::swap(gpu_batch, batch.payloads_);
} }
if (!buffer_->Send(&batch)) { try {
buffer_->Send(&batch);
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The " VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread terminates."; "prefetch thread will terminate.";
break; break;
} }
} }
buffer_->Close(); buffer_->Close();
VLOG(5) << "Prefetch thread terminates.";
} }
bool DoubleBufferReader::HasNext() const { bool DoubleBufferReader::HasNext() const {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class MultiPassReader : public framework::DecoratedReader {
public:
MultiPassReader(ReaderBase* reader, int pass_num)
: DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
reader_->ReadNext(out);
}
bool HasNext() const override {
if (reader_->HasNext()) {
return true;
} else {
++pass_count_;
if (pass_count_ >= pass_num_) {
return false;
} else {
reader_->ReInit();
return true;
}
}
}
void ReInit() override {
pass_count_ = 0;
reader_->ReInit();
}
private:
int pass_num_;
mutable int pass_count_;
};
class CreateMultiPassReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto& out = detail::Ref(scope.FindVar(Output("Out")));
int pass_num = Attr<int>("pass_num");
out.GetMutable<framework::ReaderHolder>()->Reset(
new MultiPassReader(underlying_reader.Get(), pass_num));
}
};
class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateMultiPassReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddAttr<int>("pass_num", "The number of pass to run.").GreaterThan(0);
AddComment(R"DOC(
CreateMultiPassReader Operator
This operator creates a multi-pass reader. A multi-pass reader
is used to yield data for several pass training continuously.
It takes the the number of pass to run as one of its attributes
('pass_num'), and maintains a pass counter to record how many
passes it has completed. When the underlying reader reach the EOF,
the multi-pass reader checks whether it has completed training
of the given number of pass. If not, the underlying reader will
be re-initialized and starts a new pass automatically.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_multi_pass_reader,
ops::CreateMultiPassReaderOp,
ops::CreateMultiPassReaderOpMaker);
...@@ -34,6 +34,9 @@ class ShuffleReader : public framework::DecoratedReader { ...@@ -34,6 +34,9 @@ class ShuffleReader : public framework::DecoratedReader {
} }
void ReadNext(std::vector<framework::LoDTensor>* out) override { void ReadNext(std::vector<framework::LoDTensor>* out) override {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
if (iteration_pos_ >= buffer_.size()) { if (iteration_pos_ >= buffer_.size()) {
VLOG(10) << "Resetting shuffle buffer"; VLOG(10) << "Resetting shuffle buffer";
ReadIntoBuffers(); ReadIntoBuffers();
...@@ -50,7 +53,6 @@ class ShuffleReader : public framework::DecoratedReader { ...@@ -50,7 +53,6 @@ class ShuffleReader : public framework::DecoratedReader {
buffer_.clear(); buffer_.clear();
buffer_.reserve(buffer_size_); buffer_.reserve(buffer_size_);
iteration_pos_ = 0; iteration_pos_ = 0;
PADDLE_ENFORCE(reader_->HasNext());
for (size_t i = 0; i < buffer_size_; ++i) { for (size_t i = 0; i < buffer_size_; ++i) {
if (!reader_->HasNext()) { if (!reader_->HasNext()) {
break; break;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class MultipleReader : public framework::ReaderBase {
public:
MultipleReader(const std::vector<std::string>& file_names,
const std::vector<framework::DDim>& dims, size_t thread_num)
: file_names_(file_names), dims_(dims) {
prefetchers_.resize(thread_num);
StartNewScheduler();
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
bool HasNext() const override;
void ReInit() override;
~MultipleReader() { EndScheduler(); }
private:
void StartNewScheduler();
void EndScheduler();
void ScheduleThreadFunc();
void PrefetchThreadFunc(std::string file_name, size_t thread_idx);
std::vector<std::string> file_names_;
std::vector<framework::DDim> dims_;
std::thread scheduler_;
std::vector<std::thread> prefetchers_;
framework::Channel<size_t>* waiting_file_idx_;
framework::Channel<size_t>* available_thread_idx_;
framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
mutable std::vector<framework::LoDTensor> local_buffer_;
};
void MultipleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
if (local_buffer_.empty()) {
buffer_->Receive(&local_buffer_);
}
*out = local_buffer_;
local_buffer_.clear();
}
bool MultipleReader::HasNext() const {
return local_buffer_.empty() ? buffer_->Receive(&local_buffer_) : true;
}
void MultipleReader::ReInit() {
EndScheduler();
local_buffer_.clear();
StartNewScheduler();
}
void MultipleReader::StartNewScheduler() {
size_t thread_num = prefetchers_.size();
waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
available_thread_idx_ = framework::MakeChannel<size_t>(thread_num);
buffer_ =
framework::MakeChannel<std::vector<framework::LoDTensor>>(thread_num);
for (size_t i = 0; i < file_names_.size(); ++i) {
waiting_file_idx_->Send(&i);
}
waiting_file_idx_->Close();
for (size_t i = 0; i < thread_num; ++i) {
available_thread_idx_->Send(&i);
}
scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
}
void MultipleReader::EndScheduler() {
available_thread_idx_->Close();
buffer_->Close();
waiting_file_idx_->Close();
if (scheduler_.joinable()) {
scheduler_.join();
}
delete buffer_;
delete available_thread_idx_;
delete waiting_file_idx_;
}
void MultipleReader::ScheduleThreadFunc() {
VLOG(5) << "MultipleReader schedule thread starts.";
size_t completed_thread_num = 0;
size_t thread_idx;
while (available_thread_idx_->Receive(&thread_idx)) {
std::thread& prefetcher = prefetchers_[thread_idx];
if (prefetcher.joinable()) {
prefetcher.join();
}
size_t file_idx;
if (waiting_file_idx_->Receive(&file_idx)) {
// Still have files to read. Start a new prefetch thread.
std::string file_name = file_names_[file_idx];
prefetcher = std::thread([this, file_name, thread_idx] {
PrefetchThreadFunc(file_name, thread_idx);
});
} else {
// No more file to read.
++completed_thread_num;
if (completed_thread_num == prefetchers_.size()) {
buffer_->Close();
break;
}
}
}
// If users invoke ReInit() when scheduler is running, it will close the
// 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler
// to release their resource. So a check is needed before scheduler ends.
for (auto& p : prefetchers_) {
if (p.joinable()) {
p.join();
}
}
VLOG(5) << "MultipleReader schedule thread terminates.";
}
void MultipleReader::PrefetchThreadFunc(std::string file_name,
size_t thread_idx) {
VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
std::unique_ptr<framework::ReaderBase> reader =
CreateReaderByFileName(file_name, dims_);
while (reader->HasNext()) {
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
try {
buffer_->Send(&ins);
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
"thread of file '"
<< file_name << "' will terminate.";
break;
}
}
try {
available_thread_idx_->Send(&thread_idx);
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
"Fail to send thread_idx.";
}
VLOG(5) << "The prefetch thread of file '" << file_name << "' terminates.";
}
class OpenFilesOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()),
"The accumulate of all ranks should be equal to the "
"shape concat's length.");
const auto& file_names = Attr<std::vector<std::string>>("file_names");
PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
const size_t thread_num = Attr<int>("thread_num");
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new MultipleReader(
file_names, RestoreShapes(shape_concat, ranks), thread_num));
}
};
class OpenFilesOpMaker : public FileReaderMakerBase {
public:
OpenFilesOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: FileReaderMakerBase(op_proto, op_checker) {
AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
.GreaterThan(0);
AddComment(R"DOC(
OpenFiles Operator
An OpenFilesOp creates a MultipleReader, which is able to
read data multi-threaded from multiple files.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace reader = paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(open_files, reader::OpenFilesOp,
reader::OpenFilesOpMaker);
...@@ -36,6 +36,21 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() { ...@@ -36,6 +36,21 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() {
return regs; return regs;
} }
std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
const std::string& file_name, const std::vector<framework::DDim>& dims) {
size_t separator_pos = file_name.find_last_of(kFileFormatSeparator);
PADDLE_ENFORCE_NE(separator_pos, std::string::npos,
"File name illegal! A legal file name should be like: "
"[file_name].[file_format] (e.g., 'data_file.recordio').");
std::string filetype = file_name.substr(separator_pos + 1);
auto itor = FileReaderRegistry().find(filetype);
PADDLE_ENFORCE(itor != FileReaderRegistry().end(),
"No file reader registered for '%s' format.", filetype);
framework::ReaderBase* reader = (itor->second)(file_name, dims);
return std::unique_ptr<framework::ReaderBase>(reader);
}
FileReaderMakerBase::FileReaderMakerBase( FileReaderMakerBase::FileReaderMakerBase(
framework::OpProtoAndCheckerMaker::OpProto* op_proto, framework::OpProtoAndCheckerMaker::OpProto* op_proto,
framework::OpAttrChecker* op_checker) framework::OpAttrChecker* op_checker)
......
...@@ -21,6 +21,8 @@ namespace paddle { ...@@ -21,6 +21,8 @@ namespace paddle {
namespace operators { namespace operators {
namespace reader { namespace reader {
static constexpr char kFileFormatSeparator[] = ".";
using FileReaderCreator = std::function<framework::ReaderBase*( using FileReaderCreator = std::function<framework::ReaderBase*(
const std::string&, const std::vector<framework::DDim>&)>; const std::string&, const std::vector<framework::DDim>&)>;
...@@ -29,12 +31,15 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry(); ...@@ -29,12 +31,15 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry();
template <typename Reader> template <typename Reader>
int RegisterFileReader(const std::string& filetype) { int RegisterFileReader(const std::string& filetype) {
FileReaderRegistry()[filetype] = []( FileReaderRegistry()[filetype] = [](
const std::string& fn, const std::vector<paddle::framework::DDim>& dim) { const std::string& fn, const std::vector<framework::DDim>& dims) {
return new Reader(fn, dim); return new Reader(fn, dims);
}; };
return 0; return 0;
} }
std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
const std::string& file_name, const std::vector<framework::DDim>& dims);
extern std::vector<framework::DDim> RestoreShapes( extern std::vector<framework::DDim> RestoreShapes(
const std::vector<int>& shape_concat, const std::vector<int>& ranks); const std::vector<int>& shape_concat, const std::vector<int>& ranks);
......
...@@ -27,6 +27,7 @@ namespace operators { ...@@ -27,6 +27,7 @@ namespace operators {
static constexpr char kX[] = "X"; static constexpr char kX[] = "X";
static constexpr char kCaseToExecute[] = "case_to_execute"; static constexpr char kCaseToExecute[] = "case_to_execute";
static constexpr char kOutputs[] = "Out";
static constexpr char kCases[] = "cases"; static constexpr char kCases[] = "cases";
static constexpr char kCasesBlock[] = "sub_block"; static constexpr char kCasesBlock[] = "sub_block";
...@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
"(Int) The variable the sets the index of the case to execute, " "(Int) The variable the sets the index of the case to execute, "
"after evaluating the channels being sent to and received from") "after evaluating the channels being sent to and received from")
.AsDuplicable(); .AsDuplicable();
AddOutput(kOutputs,
"A set of variables, which will be assigned with values "
"generated by the operators inside the cases of Select Op.")
.AsDuplicable();
AddAttr<std::vector<std::string>>(kCases, AddAttr<std::vector<std::string>>(kCases,
"(String vector) Serialized list of" "(String vector) Serialized list of"
"all cases in the select op. Each" "all cases in the select op. Each"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
class SendBarrierOp : public framework::OperatorBase {
public:
SendBarrierOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
// need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait());
}
};
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("RPCClient",
"(RPCClient) The RPC client object which is"
"initialized at most once.");
AddComment(R"DOC(
SendBarrier operator
This operator will send a send barrier signal to list_and_serv op, so that
the Parameter Server would knew all variables have been sent.
)DOC");
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendBarrierOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendBarrierOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
ops::SendBarrierOpVarTypeInference,
ops::SendBarrierOpShapeInference);
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include <future> #include <future>
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase { ...@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
auto client_var_name = Output("RPCClient"); auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.", "Can not find variable '%s' in the scope.",
...@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase { ...@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase {
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else { } else {
VLOG(3) << "don't send no-initialied variable: " << ins[i]; VLOG(3) << "don't send no-initialied variable: " << ins[i];
...@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase { ...@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase {
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep; VLOG(2) << "batch barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
if (outs.size() > 0) { if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) { for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
// tell pservers that current trainer have called fetch // tell pservers that current trainer have called fetch
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(3) << "send fetch barrier, ep: " << ep; VLOG(2) << "send fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep); rpc_client->AsyncSendFetchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
static bool NeedSend(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
varname);
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
"[LodTensor, SelectedRows]");
}
return false;
}
class SendVarsOp : public framework::OperatorBase {
public:
SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_sent");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
if (sync_send) {
rpc_client->Wait();
}
}
};
class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which will be"
"initialized at most once.");
AddComment(R"DOC(
Send operator
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
AddAttr<int>("ync_send",
"(int, default 0)"
"sync send or async send.")
.SetDefault(0);
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendVarsOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendVarsOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
ops::SendVarsOpVarTypeInference,
ops::SendVarsOpShapeInference);
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::LoDTensor;
class SequenceExpandOp : public framework::OperatorWithKernel { class SequenceExpandOp : public framework::OperatorWithKernel {
public: public:
...@@ -25,15 +25,71 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ...@@ -25,15 +25,71 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X")); PADDLE_ENFORCE(ctx->HasInput("X"),
PADDLE_ENFORCE(ctx->HasOutput("Out")); "Input(X) of SequenceExpandOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y")); PADDLE_ENFORCE(ctx->HasInput("Y"),
framework::DDim out_dim; "Input(Y) of SequenceExpandOp should not be null.");
auto y_dim = ctx->GetInputDim("Y"); PADDLE_ENFORCE(ctx->HasOutput("Out"),
out_dim = ctx->GetInputDim("X"); "Output(Out) of SequenceExpandOp should not be null.");
out_dim[0] = y_dim[0];
ctx->ShareLoD("Y", "Out"); auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", out_dim); auto out_dims = x_dims;
int ref_level = ctx->Attrs().Get<int>("ref_level");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"Dimension number of Input(X) should be at least 2.");
if (ctx->IsRuntime()) {
framework::Variable* x_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
framework::Variable* y_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
auto& x_lod = x_var->Get<LoDTensor>().lod();
auto& y_lod = y_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE_LE(x_lod.size(), 1,
"Level number of Input(X)'s lod should not be "
"greater than 1.");
PADDLE_ENFORCE_GT(y_lod.size(), 0,
"Level number of Input(Y)'s lod should be "
"greater than 0.");
PADDLE_ENFORCE(
ref_level == -1 ||
(ref_level >= 0 && ref_level < static_cast<int>(y_lod.size())),
"Invlid `ref_level`, which should be either equal to -1 "
"or in [0, %d)",
y_lod.size());
if (ref_level == -1) ref_level = y_lod.size() - 1;
if (x_lod.size() > 0) {
PADDLE_ENFORCE(x_lod[0].size() == y_lod[ref_level].size(),
"Level number of Input(X)'s lod could be 0. Otherwise "
"size of Input(X)'s first level lod should be equal to "
"size of Input(Y)'s referred level lod.");
}
int64_t out_first_dim = 0;
if (y_lod[ref_level].size() <= 1) {
out_first_dim = x_dims[0];
} else {
for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
int x_seq_len = 1;
if (x_lod.size() == 1) {
x_seq_len = x_lod[0][i] - x_lod[0][i - 1];
}
out_first_dim +=
(y_lod[ref_level][i] - y_lod[ref_level][i - 1]) * x_seq_len;
}
}
out_dims[0] = out_first_dim;
ctx->SetOutputDim("Out", out_dims);
} else {
out_dims[0] = -1;
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
} }
}; };
...@@ -42,83 +98,81 @@ class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -42,83 +98,81 @@ class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", AddInput("X",
"(Tensor or LoDTensor) The input(X) of this operator can be a " "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
"LoDTensor or a base Tensor."); "level is at most 1.");
AddInput("Y", AddInput("Y",
"(LoDTensor)The reference input(Y) of sequence_expand op." "(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
"It must be a LoDTensor with k-level(k>0)." "lod (specified level) is referred by Input(X).");
"The input(X) will be expanded according to LOD of input(Y)."
"The element numbers of last level in input(Y) "
"must be equal to dims[0] of input(X).");
AddOutput("Out", AddOutput("Out",
"(LodTensor)The output of sequence_expand op." "(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
"The lod of output will be as same as input(Y)'s lod."); "generated from Input(X) by referring lod of Input(Y).");
AddAttr<int>("ref_level", "Specify lod level of Input(Y).").SetDefault(-1);
AddComment(R"DOC( AddComment(R"DOC(
Sequence Expand Operator. Sequence Expand Operator.
This operator expands input(X) according to LOD of input(Y). This operator expands `X` according to specified level lod of `Y`. Current
implementation constaints that lod level of `X` should be at most 1. Attribute
`ref_level` is used to specify which level lod of `Y` is referred to expand `X`.
If set `ref_level` to -1, then last level lod of `Y` would be referred.
Please note, rank of `X` should be at least 2, when the rank exceeds 2, `X`
would be viewed as a 2-D tensor.
Following are cases to better explain how this works: Following are cases to better explain how this works:
Case 1: Case 1:
Given a 2-level LoDTensor input(X) Given a 1-level LoDTensor input(X)
X.lod = [[0, 2, 3], X.lod = [[0, 2, 4]]
[0, 1, 3, 4]] X.data = [[a], [b], [c], [d]]
X.data = [a, b, c, d]
X.dims = [4, 1] X.dims = [4, 1]
and input(Y) and input(Y)
Y.lod = [[0, 2, 4], Y.lod = [[0, 2, 4],
[0, 3, 6, 7, 8]] [0, 3, 6, 7, 8]]
with condition len(Y.lod[-1]) -1 == X.dims[0] ref_level: 0
then we get 2-level LoDTensor then we get 1-level LoDTensor
Out.lod = [[0, 2, 4], Out.lod = [[0, 2, 4, 6, 8]]
[0, 3, 6, 7, 8]] Out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
Out.data = [a, a, a, b, b, b, c, d]
Out.dims = [8, 1] Out.dims = [8, 1]
Case 2: Case 2:
Given 1-level LoDTensor input(X)
X.lod = [[0, 1, 4]]
X.data = [[a], [b], [c], [d]]
X.dims = [4, 1]
and input(Y)
Y.lod = [[0, 2, 4],
[0, 3, 6, 6, 8]]
ref_level: 0
then we get 1-level LoDTensor
Out.lod = [[0, 1, 2, 5, 8]]
Out.data = [[a], [a], [b], [c], [d], [b], [c], [d]]
Out.dims = [8, 1]
Case 3:
Given a common Tensor input(X) Given a common Tensor input(X)
X.data = [a, b, c] X.data = [[a], [b], [c]]
X.dims = [3, 1] X.dims = [3, 1]
and input(Y) and input(Y)
Y.lod = [[0, 2, 3, 6]] Y.lod = [[0, 2, 3, 6]]
with condition len(Y.lod[-1]) -1 == X.dims[0] ref_level: -1
then we get 1-level LoDTensor then we get a common Tensor
Out.lod = [[0, 2, 3, 6]] Out.data = [[a], [a], [b], [c], [c], [c]]
Out.data = [a, a, b, c, c, c]
Out.dims = [6, 1] Out.dims = [6, 1]
Case 3: Case 4:
Given a common Tensor input(X) Given a common Tensor input(X)
X.data = [[a, b], [c, d], [e, f]] X.data = [[a, b], [c, d], [e, f]]
X.dims = [3, 2] X.dims = [3, 2]
and input(Y) and input(Y)
Y.lod = [[0, 2, 3, 6]] Y.lod = [[0, 2, 3, 6]]
with condition len(Y.lod[-1]) -1 == X.dims[0] ref_level: 0
then we get 1-level LoDTensor then we get a common LoDTensor
Out.lod = [[0, 2, 3, 6]] Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
Out.data = [[a,b], [a,b] [c,d], [e, f], [e, f], [e, f]]
Out.dims = [6, 2] Out.dims = [6, 2]
Case 4:
Given 2-level a LoDTensor input(X)
X.lod = [[0, 2, 3],
[0, 1, 3, 4]]
X.data = [a, b, c, d]
X.dims = [4, 1]
and input(Y)
Y.lod = [[0, 2, 4],
[0, 3, 6, 6, 8]]
with condition len(Y.lod[-1]) -1 == X.dims[0]
then we get 2-level LoDTensor
Out.lod = [[0, 2, 4],
[0, 3, 6, 6, 8]]
Out.data = [a, a, a, b, b, b, d, d]
Out.dims = [8, 1]
)DOC"); )DOC");
} }
}; };
...@@ -129,12 +183,14 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { ...@@ -129,12 +183,14 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X")); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Out")); PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"The input(Out@GRAD) should not be null"); "Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims); ctx->SetOutputDim(x_grad_name, x_dims);
} }
...@@ -149,7 +205,13 @@ REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, ...@@ -149,7 +205,13 @@ REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
sequence_expand_grad, ops::SequenceExpandOpGrad); sequence_expand_grad, ops::SequenceExpandOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
sequence_expand, sequence_expand,
ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>); ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
sequence_expand_grad, sequence_expand_grad,
ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>); ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -150,7 +150,14 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> { ...@@ -150,7 +150,14 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
sequence_expand, sequence_expand,
ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>); ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
sequence_expand_grad, sequence_expand_grad,
ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>); ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
...@@ -56,7 +56,9 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> { ...@@ -56,7 +56,9 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace, namespace plat = paddle::platform;
ops::SoftmaxCUDNNKernel<float>); REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace, ops::SoftmaxCUDNNKernel<float>,
ops::SoftmaxCUDNNKernel<plat::float16>);
REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
ops::SoftmaxGradCUDNNKernel<float>); ops::SoftmaxGradCUDNNKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include <iostream>
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory; // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::softmax_forward;
using mkldnn::prop_kind;
using mkldnn::stream;
template <typename T>
class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out");
PADDLE_ENFORCE(input->dims().size() == 2UL,
"The input of softmax op must be a 2D matrix.");
const T* input_data = input->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// MKL-DNN does support softmax over selected axis. Having 2D Tensor,
// we will make normalization after final eg. axis: 1
PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
"Softmax input and output dimensions should match");
// Same memory descriptor to be used for input and output
memory::dims softmax_tz = {src_tz[0], src_tz[1]};
// Currently only supports NC data format
// TODO(jczaja-intel): support more formats
auto softmax_md =
MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
// Normalization is made after innermost dimension eg. C out of NC
auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
softmax_md, 1 /*dim: C*/);
// create memory primitives
auto softmax_src_memory =
memory({softmax_md, mkldnn_engine}, (void*)input_data);
auto softmax_dst_memory =
memory({softmax_md, mkldnn_engine}, (void*)output_data);
auto softmax_prim_desc =
softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
softmax_dst_memory);
std::vector<primitive> pipeline{softmax};
stream(stream::kind::eager).submit(pipeline).wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
ops::SoftmaxMKLDNNKernel<float>);
...@@ -13,7 +13,13 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -38,26 +44,32 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -38,26 +44,32 @@ class SoftmaxOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported. // choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); framework::LibraryType library_{framework::LibraryType::kPlain};
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::CanCUDNNBeUsed(ctx)) {
auto& dev_ctx = library_ = framework::LibraryType::kCUDNN;
ctx.template device_context<platform::CUDADeviceContext>();
runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
} }
#endif #endif
framework::LibraryType library_ = framework::LibraryType::kPlain; #ifdef PADDLE_WITH_MKLDNN
if (use_cudnn && runtime_cudnn_support) { if (library_ == framework::LibraryType::kPlain &&
library_ = framework::LibraryType::kCUDNN; platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
} }
#endif
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
}
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
return framework::OpKernelType( return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(), framework::StringToDataLayout(data_format),
framework::StringToDataLayout(data_format), library_); library_);
} }
}; };
class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
...@@ -77,6 +89,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -77,6 +89,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
"Defaults to \"NHWC\". Specify the data format of the output data, " "Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Softmax Operator. Softmax Operator.
...@@ -119,19 +134,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ...@@ -119,19 +134,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported. // choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); framework::LibraryType library_{framework::LibraryType::kPlain};
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::CanCUDNNBeUsed(ctx)) {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
}
#endif
framework::LibraryType library_ = framework::LibraryType::kPlain;
if (use_cudnn && runtime_cudnn_support) {
library_ = framework::LibraryType::kCUDNN; library_ = framework::LibraryType::kCUDNN;
} }
#endif
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_ids_op.h"
namespace paddle {
namespace operators {
class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitIdsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
.AsDuplicable();
AddComment(R"DOC(
Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number
Example:
Input:
X = [1,2,3,4,5,6]
Out(3 output):
out0 = [3, 6]
out1 = [1, 4]
out2 = [2, 5]
)DOC");
}
};
class SplitIdsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must has input Ids.");
PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
auto ids_var_type = ctx->GetInputsVarType("Ids").front();
PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
}
};
class SplitIdsOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
ops::SplitIdsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SplitIdsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
if (!platform::is_cpu_place(place)) {
PADDLE_THROW("SplitIds do not support GPU kernel");
}
const auto* ids_t = ctx.Input<framework::LoDTensor>("Ids");
auto& ids_dims = ids_t->dims();
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
const T* ids = ids_t->data<T>();
const size_t shard_num = outs.size();
std::vector<std::vector<T>> out_ids;
out_ids.resize(outs.size());
// split id by their shard_num.
for (size_t i = 0; i < ids_dims[0]; ++i) {
T id = ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id);
}
// create tensor for each shard and send to parameter server
for (size_t i = 0; i < out_ids.size(); ++i) {
auto* shard_t = outs[i];
std::vector<T> ids = out_ids[i];
auto* shard_data = shard_t->mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
for (size_t i = 0; i < ids.size(); ++i) {
shard_data[i] = ids[i];
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -153,8 +153,8 @@ template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float, ...@@ -153,8 +153,8 @@ template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp, REGISTER_OPERATOR(target_assign, ops::TargetAssignOp, ops::TargetAssignOpMaker,
ops::TargetAssignOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
target_assign, target_assign,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>, ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
......
...@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ ...@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
......
...@@ -86,7 +86,8 @@ class CudnnDataType<float16> { ...@@ -86,7 +86,8 @@ class CudnnDataType<float16> {
public: public:
static const cudnnDataType_t type = CUDNN_DATA_HALF; static const cudnnDataType_t type = CUDNN_DATA_HALF;
// The scaling param type is float for HALF and FLOAT tensors // The scaling param type is float for HALF and FLOAT tensors
typedef const float ScalingParamType; using ScalingParamType = const float;
using BatchNormParamType = float;
static ScalingParamType* kOne() { static ScalingParamType* kOne() {
static ScalingParamType v = 1.0; static ScalingParamType v = 1.0;
return &v; return &v;
...@@ -101,7 +102,8 @@ template <> ...@@ -101,7 +102,8 @@ template <>
class CudnnDataType<float> { class CudnnDataType<float> {
public: public:
static const cudnnDataType_t type = CUDNN_DATA_FLOAT; static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
typedef const float ScalingParamType; using ScalingParamType = const float;
using BatchNormParamType = float;
static ScalingParamType* kOne() { static ScalingParamType* kOne() {
static ScalingParamType v = 1.0; static ScalingParamType v = 1.0;
return &v; return &v;
...@@ -116,7 +118,8 @@ template <> ...@@ -116,7 +118,8 @@ template <>
class CudnnDataType<double> { class CudnnDataType<double> {
public: public:
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
typedef const double ScalingParamType; using ScalingParamType = const double;
using BatchNormParamType = double;
static ScalingParamType* kOne() { static ScalingParamType* kOne() {
static ScalingParamType v = 1.0; static ScalingParamType v = 1.0;
return &v; return &v;
......
...@@ -10,43 +10,45 @@ See the License for the specific language governing permissions and ...@@ -10,43 +10,45 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <unordered_set>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
DeviceContextPool* DeviceContextPool::pool = nullptr; DeviceContextPool* DeviceContextPool::pool = nullptr;
const platform::DeviceContext* DeviceContextPool::Get( platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
const platform::Place& place) {
auto it = device_contexts_.find(place); auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) { if (it == device_contexts_.end()) {
PADDLE_THROW( PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU " "'Place' is not supported, Please re-compile with WITH_GPU "
"option"); "option");
} }
return it->second; return it->second.get();
} }
DeviceContextPool::DeviceContextPool( DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) { using PtrType = std::unique_ptr<DeviceContext>;
if (platform::is_cpu_place(places[i])) { std::unordered_set<Place, PlaceHash> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace(places[i], device_contexts_.emplace(
new platform::MKLDNNDeviceContext( p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
boost::get<platform::CPUPlace>(places[i])));
#else #else
device_contexts_.emplace(places[i], device_contexts_.emplace(
new platform::CPUDeviceContext( p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
boost::get<platform::CPUPlace>(places[i])));
#endif #endif
} else if (platform::is_gpu_place(places[i])) { } else if (platform::is_gpu_place(p)) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i], device_contexts_.emplace(
new platform::CUDADeviceContext( p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
boost::get<platform::CUDAPlace>(places[i])));
#else #else
PADDLE_THROW( PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU " "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...@@ -159,6 +161,7 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -159,6 +161,7 @@ CUDADeviceContext::~CUDADeviceContext() {
Place CUDADeviceContext::GetPlace() const { return place_; } Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { void CUDADeviceContext::Wait() const {
std::lock_guard<std::mutex> guard(mutex_);
PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaGetLastError()); PADDLE_ENFORCE(cudaGetLastError());
} }
......
...@@ -103,6 +103,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -103,6 +103,7 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
mutable std::mutex mutex_;
cudaStream_t stream_; cudaStream_t stream_;
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
...@@ -159,7 +160,7 @@ class DeviceContextPool { ...@@ -159,7 +160,7 @@ class DeviceContextPool {
} }
/*! \brief Return handle of single device context. */ /*! \brief Return handle of single device context. */
const platform::DeviceContext* Get(const platform::Place& place); platform::DeviceContext* Get(const platform::Place& place);
template <typename Place> template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace( const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
...@@ -172,19 +173,8 @@ class DeviceContextPool { ...@@ -172,19 +173,8 @@ class DeviceContextPool {
private: private:
static DeviceContextPool* pool; static DeviceContextPool* pool;
constexpr static int LEFT_SHIFT = 8; std::unordered_map<const platform::Place,
struct Hash { std::unique_ptr<platform::DeviceContext>, PlaceHash>
std::hash<int> hash_;
size_t operator()(const platform::Place& place) const {
int pre_hash = place.which() << LEFT_SHIFT;
if (platform::is_gpu_place(place)) {
pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
}
return hash_(pre_hash);
}
};
std::unordered_map<const platform::Place, const platform::DeviceContext*,
Hash>
device_contexts_; device_contexts_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool); DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
}; };
......
...@@ -483,9 +483,124 @@ DEVICE inline bool operator>=(const half& a, const half& b) { ...@@ -483,9 +483,124 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
#endif // PADDLE_CUDA_FP16 #endif // PADDLE_CUDA_FP16
// Arithmetic operators on ARMv8.2-A CPU // Arithmetic operators for float16 on GPU
#if defined(PADDLE_WITH_NATIVE_FP16) #if defined(PADDLE_CUDA_FP16)
HOST inline float16 operator+(const float16& a, const float16& b) { HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hadd(half(a), half(b)));
#else
return float16(float(a) + float(b));
#endif
}
HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hsub(half(a), half(b)));
#else
return float16(float(a) - float(b));
#endif
}
HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hmul(half(a), half(b)));
#else
return float16(float(a) * float(b));
#endif
}
HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
// TODO(kexinzhao): check which cuda version starts to support __hdiv
float num = __half2float(half(a));
float denom = __half2float(half(b));
return float16(num / denom);
#else
return float16(float(a) / float(b));
#endif
}
HOSTDEVICE inline float16 operator-(const float16& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hneg(half(a)));
#else
float16 res;
res.x = a.x ^ 0x8000;
return res;
#endif
}
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
a = a + b;
return a;
}
HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
a = a - b;
return a;
}
HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
a = a * b;
return a;
}
HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
a = a / b;
return a;
}
HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(half(a), half(b));
#else
return float(a) == float(b);
#endif
}
HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(half(a), half(b));
#else
return float(a) != float(b);
#endif
}
HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(half(a), half(b));
#else
return float(a) < float(b);
#endif
}
HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(half(a), half(b));
#else
return float(a) <= float(b);
#endif
}
HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(half(a), half(b));
#else
return float(a) > float(b);
#endif
}
HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hge(half(a), half(b));
#else
return float(a) >= float(b);
#endif
}
// Arithmetic operators for float16 on ARMv8.2-A CPU
#elif defined(PADDLE_WITH_NATIVE_FP16)
inline float16 operator+(const float16& a, const float16& b) {
float16 res; float16 res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -501,7 +616,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) { ...@@ -501,7 +616,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) {
return res; return res;
} }
HOST inline float16 operator-(const float16& a, const float16& b) { inline float16 operator-(const float16& a, const float16& b) {
float16 res; float16 res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -517,7 +632,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) { ...@@ -517,7 +632,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) {
return res; return res;
} }
HOST inline float16 operator*(const float16& a, const float16& b) { inline float16 operator*(const float16& a, const float16& b) {
float16 res; float16 res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -533,7 +648,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) { ...@@ -533,7 +648,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) {
return res; return res;
} }
HOST inline float16 operator/(const float16& a, const float16& b) { inline float16 operator/(const float16& a, const float16& b) {
float16 res; float16 res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -549,7 +664,7 @@ HOST inline float16 operator/(const float16& a, const float16& b) { ...@@ -549,7 +664,7 @@ HOST inline float16 operator/(const float16& a, const float16& b) {
return res; return res;
} }
HOST inline float16 operator-(const float16& a) { inline float16 operator-(const float16& a) {
float16 res; float16 res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -564,27 +679,27 @@ HOST inline float16 operator-(const float16& a) { ...@@ -564,27 +679,27 @@ HOST inline float16 operator-(const float16& a) {
return res; return res;
} }
HOST inline float16& operator+=(float16& a, const float16& b) { inline float16& operator+=(float16& a, const float16& b) {
a = a + b; a = a + b;
return a; return a;
} }
HOST inline float16& operator-=(float16& a, const float16& b) { inline float16& operator-=(float16& a, const float16& b) {
a = a - b; a = a - b;
return a; return a;
} }
HOST inline float16& operator*=(float16& a, const float16& b) { inline float16& operator*=(float16& a, const float16& b) {
a = a * b; a = a * b;
return a; return a;
} }
HOST inline float16& operator/=(float16& a, const float16& b) { inline float16& operator/=(float16& a, const float16& b) {
a = a / b; a = a / b;
return a; return a;
} }
HOST inline bool operator==(const float16& a, const float16& b) { inline bool operator==(const float16& a, const float16& b) {
uint16_t res; uint16_t res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -600,11 +715,9 @@ HOST inline bool operator==(const float16& a, const float16& b) { ...@@ -600,11 +715,9 @@ HOST inline bool operator==(const float16& a, const float16& b) {
return (res & 0xffff) != 0; return (res & 0xffff) != 0;
} }
HOST inline bool operator!=(const float16& a, const float16& b) { inline bool operator!=(const float16& a, const float16& b) { return !(a == b); }
return !(a == b);
}
HOST inline bool operator<(const float16& a, const float16& b) { inline bool operator<(const float16& a, const float16& b) {
uint16_t res; uint16_t res;
asm volatile( asm volatile(
"ld1 {v1.h}[0], [%[a_ptr]]\n" "ld1 {v1.h}[0], [%[a_ptr]]\n"
...@@ -620,7 +733,7 @@ HOST inline bool operator<(const float16& a, const float16& b) { ...@@ -620,7 +733,7 @@ HOST inline bool operator<(const float16& a, const float16& b) {
return (res & 0xffff) != 0; return (res & 0xffff) != 0;
} }
HOST inline bool operator<=(const float16& a, const float16& b) { inline bool operator<=(const float16& a, const float16& b) {
uint16_t res; uint16_t res;
asm volatile( asm volatile(
"ld1 {v1.h}[0], [%[a_ptr]]\n" "ld1 {v1.h}[0], [%[a_ptr]]\n"
...@@ -636,7 +749,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) { ...@@ -636,7 +749,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) {
return (res & 0xffff) != 0; return (res & 0xffff) != 0;
} }
HOST inline bool operator>(const float16& a, const float16& b) { inline bool operator>(const float16& a, const float16& b) {
uint16_t res; uint16_t res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -652,7 +765,7 @@ HOST inline bool operator>(const float16& a, const float16& b) { ...@@ -652,7 +765,7 @@ HOST inline bool operator>(const float16& a, const float16& b) {
return (res & 0xffff) != 0; return (res & 0xffff) != 0;
} }
HOST inline bool operator>=(const float16& a, const float16& b) { inline bool operator>=(const float16& a, const float16& b) {
uint16_t res; uint16_t res;
asm volatile( asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n" "ld1 {v0.h}[0], [%[a_ptr]]\n"
...@@ -668,71 +781,71 @@ HOST inline bool operator>=(const float16& a, const float16& b) { ...@@ -668,71 +781,71 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
return (res & 0xffff) != 0; return (res & 0xffff) != 0;
} }
// Arithmetic operators, software emulated on other CPU // Arithmetic operators for float16, software emulated on other CPU
#else #else
HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { inline float16 operator+(const float16& a, const float16& b) {
return float16(float(a) + float(b)); return float16(float(a) + float(b));
} }
HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { inline float16 operator-(const float16& a, const float16& b) {
return float16(float(a) - float(b)); return float16(float(a) - float(b));
} }
HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { inline float16 operator*(const float16& a, const float16& b) {
return float16(float(a) * float(b)); return float16(float(a) * float(b));
} }
HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { inline float16 operator/(const float16& a, const float16& b) {
return float16(float(a) / float(b)); return float16(float(a) / float(b));
} }
HOSTDEVICE inline float16 operator-(const float16& a) { inline float16 operator-(const float16& a) {
float16 res; float16 res;
res.x = a.x ^ 0x8000; res.x = a.x ^ 0x8000;
return res; return res;
} }
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { inline float16& operator+=(float16& a, const float16& b) {
a = float16(float(a) + float(b)); a = float16(float(a) + float(b));
return a; return a;
} }
HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { inline float16& operator-=(float16& a, const float16& b) {
a = float16(float(a) - float(b)); a = float16(float(a) - float(b));
return a; return a;
} }
HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { inline float16& operator*=(float16& a, const float16& b) {
a = float16(float(a) * float(b)); a = float16(float(a) * float(b));
return a; return a;
} }
HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { inline float16& operator/=(float16& a, const float16& b) {
a = float16(float(a) / float(b)); a = float16(float(a) / float(b));
return a; return a;
} }
HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { inline bool operator==(const float16& a, const float16& b) {
return float(a) == float(b); return float(a) == float(b);
} }
HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { inline bool operator!=(const float16& a, const float16& b) {
return float(a) != float(b); return float(a) != float(b);
} }
HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { inline bool operator<(const float16& a, const float16& b) {
return float(a) < float(b); return float(a) < float(b);
} }
HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { inline bool operator<=(const float16& a, const float16& b) {
return float(a) <= float(b); return float(a) <= float(b);
} }
HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { inline bool operator>(const float16& a, const float16& b) {
return float(a) > float(b); return float(a) > float(b);
} }
HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { inline bool operator>=(const float16& a, const float16& b) {
return float(a) >= float(b); return float(a) >= float(b);
} }
#endif #endif
......
...@@ -65,6 +65,18 @@ bool is_cpu_place(const Place &); ...@@ -65,6 +65,18 @@ bool is_cpu_place(const Place &);
bool places_are_same_class(const Place &, const Place &); bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &); bool is_same_place(const Place &, const Place &);
struct PlaceHash {
std::size_t operator()(const Place &p) const {
constexpr size_t num_dev_bits = 4;
std::hash<int> ihash;
size_t dev_id = 0;
if (is_gpu_place(p)) {
dev_id = boost::get<CUDAPlace>(p).device;
}
return ihash(dev_id << num_dev_bits | p.which());
}
};
std::ostream &operator<<(std::ostream &, const Place &); std::ostream &operator<<(std::ostream &, const Place &);
template <typename Visitor> template <typename Visitor>
......
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(paddle_pybind SHARED if(WITH_AMD_GPU)
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc hip_library(paddle_pybind SHARED
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
${GLOB_OP_LIB}) DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
if(NOT APPLE AND NOT ANDROID) ${GLOB_OP_LIB})
target_link_libraries(paddle_pybind rt) else()
endif(NOT APPLE AND NOT ANDROID) cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID)
endif(WITH_AMD_GPU)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -29,8 +29,8 @@ Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs) ...@@ -29,8 +29,8 @@ Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
bool Header::Parse(std::istream& is) { bool Header::Parse(std::istream& is) {
uint32_t magic; uint32_t magic;
size_t read_size = is.read(reinterpret_cast<char*>(&magic), sizeof(uint32_t));
is.readsome(reinterpret_cast<char*>(&magic), sizeof(uint32_t)); size_t read_size = is.gcount();
if (read_size < sizeof(uint32_t)) { if (read_size < sizeof(uint32_t)) {
return false; return false;
} }
......
...@@ -28,6 +28,7 @@ Scanner::Scanner(const std::string &filename) { ...@@ -28,6 +28,7 @@ Scanner::Scanner(const std::string &filename) {
} }
void Scanner::Reset() { void Scanner::Reset() {
stream_->clear();
stream_->seekg(0, std::ios::beg); stream_->seekg(0, std::ios::beg);
ParseNextChunk(); ParseNextChunk();
} }
......
...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr; ...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr;
} __name; // struct DynLoad__##__name } __name; // struct DynLoad__##__name
#endif #endif
#ifdef PADDLE_USE_ATLAS #define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_SGETRF clapack_sgetrf #define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_DGETRF clapack_dgetrf #define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_SGETRI clapack_sgetri #define PADDLE_DGETRI LAPACKE_dgetri
#define PADDLE_DGETRI clapack_dgetri
#else
#define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_DGETRI LAPACKE_dgetri
#endif
#define LAPACK_ROUTINE_EACH(__macro) \ #define LAPACK_ROUTINE_EACH(__macro) \
__macro(PADDLE_SGETRF) \ __macro(PADDLE_SGETRF) \
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB) #if defined(PADDLE_USE_VECLIB)
extern "C" { extern "C" {
#include <cblas.h> #include <cblas.h>
#include <clapack.h> #include <clapack.h>
......
...@@ -35,8 +35,9 @@ function cmake_gen() { ...@@ -35,8 +35,9 @@ function cmake_gen() {
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS} ${PYTHON_FLAGS}
-DWITH_DSO=ON -DWITH_DSO=ON
-DWITH_DOC=OFF -DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
...@@ -50,6 +51,7 @@ function cmake_gen() { ...@@ -50,6 +51,7 @@ function cmake_gen() {
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON -DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
======================================== ========================================
EOF EOF
...@@ -60,8 +62,9 @@ EOF ...@@ -60,8 +62,9 @@ EOF
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \ ${PYTHON_FLAGS} \
-DWITH_DSO=ON \ -DWITH_DSO=ON \
-DWITH_DOC=OFF \ -DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
...@@ -74,6 +77,7 @@ EOF ...@@ -74,6 +77,7 @@ EOF
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \ -DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
} }
...@@ -231,7 +235,7 @@ gen_capi_package ...@@ -231,7 +235,7 @@ gen_capi_package
gen_fluid_inference_lib gen_fluid_inference_lib
if [[ ${WITH_C_API:-OFF} == "ON" ]]; then if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
else else
printf "If you need to install PaddlePaddle in develop docker image," printf "If you need to install PaddlePaddle in develop docker image,"
printf "please make install or pip install build/python/dist/*.whl.\n" printf "please make install or pip install build/python/dist/*.whl.\n"
......
#!/bin/bash
docker run --rm \
-v $(git rev-parse --show-toplevel):/paddle \
-e "WITH_GPU=OFF" \
-e "WITH_AVX=ON" \
-e "WITH_DOC=ON" \
-e "WOBOQ=ON" \
${1:-"paddlepaddle/paddle:latest-dev"}
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from layers.control_flow import BlockGuard, Select from layers.control_flow import BlockGuard, equal
from .framework import Operator
from layer_helper import LayerHelper, unique_name from layer_helper import LayerHelper, unique_name
from layers import fill_constant from layers import fill_constant
import core import core
...@@ -75,6 +76,206 @@ class Go(BlockGuard): ...@@ -75,6 +76,206 @@ class Go(BlockGuard):
attrs={'sub_block': go_block}) attrs={'sub_block': go_block})
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
select,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None,
is_copy=False):
self.select = select
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else self.DEFAULT
X = value
if self.action == self.SEND and is_copy:
# We create of copy of the data we want to send
copied_X = self.select.parent_block.create_var(
name=unique_name.generate(value.name + '_copy'),
type=value.type,
dtype=value.dtype,
shape=value.shape,
lod_level=value.lod_level,
capacity=value.capacity
if hasattr(value, 'capacity') else None, )
self.select.parent_block.append_op(
type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X
self.value = X
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.parent_block = self.helper.main_program.current_block()
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value, is_copy=False):
"""Create a new block for this condition.
"""
select_case = SelectCase(self,
len(self.cases), self.case_to_execute,
channel_action_fn, channel, value, is_copy)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(self, len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={'Out': out_list})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
def make_channel(dtype, capacity=0): def make_channel(dtype, capacity=0):
""" """
Helps implementation of a concurrent program by creating a "channel" of Helps implementation of a concurrent program by creating a "channel" of
...@@ -131,7 +332,7 @@ def make_channel(dtype, capacity=0): ...@@ -131,7 +332,7 @@ def make_channel(dtype, capacity=0):
return channel return channel
def channel_send(channel, value, copy=False): def channel_send(channel, value, is_copy=False):
""" """
Sends a value through a channel variable. Used by an unbuffered or buffered Sends a value through a channel variable. Used by an unbuffered or buffered
channel to pass data from within or to a concurrent Go block, where channel to pass data from within or to a concurrent Go block, where
...@@ -141,8 +342,8 @@ def channel_send(channel, value, copy=False): ...@@ -141,8 +342,8 @@ def channel_send(channel, value, copy=False):
channel (Variable|Channel): Channel variable created using channel (Variable|Channel): Channel variable created using
`make_channel`. `make_channel`.
value (Variable): Value to send to channel value (Variable): Value to send to channel
copy (bool): Copy data while channel send. If False, then data is_copy (bool): Copy data while channel send. If False, then data
is moved. The input cannot be used after move. is moved. The input cannot be used after move. (default False)
Returns: Returns:
Variable: The boolean status on whether or not the channel Variable: The boolean status on whether or not the channel
successfully sent the passed value. successfully sent the passed value.
...@@ -159,35 +360,26 @@ def channel_send(channel, value, copy=False): ...@@ -159,35 +360,26 @@ def channel_send(channel, value, copy=False):
main_program = helper.main_program main_program = helper.main_program
channel_send_block = main_program.current_block() channel_send_block = main_program.current_block()
status = helper.create_variable(
name=unique_name.generate('status'),
type=core.VarDesc.VarType.LOD_TENSOR,
dtype=core.VarDesc.VarType.BOOL)
X = value X = value
if copy is True: if is_copy:
copied_X = helper.create_variable( copied_X = helper.create_variable(
name=unique_name.generate(value.name + '_copy'), name=unique_name.generate(value.name + '_copy'),
type=value.type, type=value.type,
dtype=value.dtype, dtype=value.dtype,
shape=value.shape, shape=value.shape,
lod_level=value.lod_level, lod_level=value.lod_level,
capacity=value.capacity) capacity=value.capacity if hasattr(value, 'capacity') else None)
assign_op = channel_send_block.append_op( assign_op = channel_send_block.append_op(
type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X X = copied_X
channel_send_op = channel_send_block.append_op( channel_send_block.append_op(
type="channel_send", type="channel_send", inputs={
inputs={
"Channel": channel, "Channel": channel,
"X": X, "X": X,
}, })
outputs={"Status": status})
return status
def channel_recv(channel, return_value): def channel_recv(channel, return_value):
......
...@@ -16,7 +16,6 @@ import sys ...@@ -16,7 +16,6 @@ import sys
import re import re
from graphviz import GraphPreviewGenerator from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
import paddle.fluid.core as core
_vartype2str_ = [ _vartype2str_ = [
"UNK", "UNK",
...@@ -126,7 +125,6 @@ def pprint_block_codes(block_desc, show_backward=False): ...@@ -126,7 +125,6 @@ def pprint_block_codes(block_desc, show_backward=False):
def is_var_backward(var_desc): def is_var_backward(var_desc):
return "@GRAD" in var_desc.name return "@GRAD" in var_desc.name
#print(type(block_desc))
if type(block_desc) is not framework_pb2.BlockDesc: if type(block_desc) is not framework_pb2.BlockDesc:
block_desc = framework_pb2.BlockDesc.FromString( block_desc = framework_pb2.BlockDesc.FromString(
block_desc.serialize_to_string()) block_desc.serialize_to_string())
......
...@@ -20,6 +20,7 @@ from layer_helper import LayerHelper ...@@ -20,6 +20,7 @@ from layer_helper import LayerHelper
from distributed_spliter import * from distributed_spliter import *
import math import math
from . import core from . import core
import debuger
class VarBlock: class VarBlock:
...@@ -289,6 +290,7 @@ class DistributeTranspiler: ...@@ -289,6 +290,7 @@ class DistributeTranspiler:
dtype=v.dtype, dtype=v.dtype,
shape=v.shape) shape=v.shape)
recv_inputs.append(var) recv_inputs.append(var)
# step3 # step3
optimize_block = pserver_program.create_block(0) optimize_block = pserver_program.create_block(0)
# step 4 # step 4
...@@ -307,15 +309,57 @@ class DistributeTranspiler: ...@@ -307,15 +309,57 @@ class DistributeTranspiler:
# Iterate through the ops, and if an op and the optimize ops # Iterate through the ops, and if an op and the optimize ops
# which located on current pserver are in one set, then # which located on current pserver are in one set, then
# append it into the sub program. # append it into the sub program.
for _, op in enumerate(self.optimize_ops):
for _, opt_op in enumerate(opt_op_on_pserver): # We try to put optimization program run parallelly, assume
if ufind.is_connected(op, opt_op): # optimization program always looks like:
if self._is_opt_op(op): #
self._append_pserver_ops(optimize_block, op, endpoint, # prevop -> prevop -> opt op -> following op -> following op; ->
default_main_program()) # prevop -> prevop -> opt op -> following op -> following op; ->
else: # global op -> global op
self._append_pserver_non_opt_ops(optimize_block, op) #
break # we put operators that can run parallelly to many program blocks.
# in above example, we seperate ops by the ";". Global ops must run
# after all the optimize ops finished.
global_ops = []
# HACK: optimization global ops only used to scale beta1 and beta2
# replace it with dependency engine.
for op in self.optimize_ops:
if op.type == "scale":
for in_name in op.input_arg_names:
if in_name.startswith("beta1_pow_acc") or\
in_name.startswith("beta2_pow_acc"):
global_ops.append(op)
def __append_optimize_op__(op, block):
if self._is_opt_op(op):
self._append_pserver_ops(block, op, endpoint,
default_main_program())
else:
self._append_pserver_non_opt_ops(block, op)
# append op to the current block
per_opt_block = optimize_block
for _, opt_op in enumerate(opt_op_on_pserver):
for _, op in enumerate(self.optimize_ops):
# optimizer is connected to itself
if ufind.is_connected(op, opt_op) and \
op not in global_ops:
__append_optimize_op__(op, per_opt_block)
per_opt_block = pserver_program.create_block(0)
# append global ops
for glb_op in global_ops:
__append_optimize_op__(glb_op, per_opt_block)
# NOT USED: single block version:
#
# for _, op in enumerate(self.optimize_ops):
# for _, opt_op in enumerate(opt_op_on_pserver):
# if ufind.is_connected(op, opt_op):
# __append_optimize_op__(glb_op, optimize_block)
# break
# step5 append the listen_and_serv op # step5 append the listen_and_serv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
type="listen_and_serv", type="listen_and_serv",
...@@ -521,6 +565,8 @@ class DistributeTranspiler: ...@@ -521,6 +565,8 @@ class DistributeTranspiler:
orig_var_name = "" orig_var_name = ""
if suff_idx >= 0: if suff_idx >= 0:
orig_var_name = varname[:suff_idx] orig_var_name = varname[:suff_idx]
else:
orig_var_name = varname
return orig_var_name return orig_var_name
def _append_pserver_ops(self, optimize_block, opt_op, endpoint, def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
...@@ -535,7 +581,8 @@ class DistributeTranspiler: ...@@ -535,7 +581,8 @@ class DistributeTranspiler:
grad_block = None grad_block = None
for g in self.param_grad_ep_mapping[endpoint]["grads"]: for g in self.param_grad_ep_mapping[endpoint]["grads"]:
if same_or_split_var( if same_or_split_var(
self._orig_varname(g.name), opt_op.input(key)[0]): self._orig_varname(g.name),
self._orig_varname(opt_op.input(key)[0])):
grad_block = g grad_block = g
break break
if not grad_block: if not grad_block:
...@@ -660,10 +707,22 @@ class DistributeTranspiler: ...@@ -660,10 +707,22 @@ class DistributeTranspiler:
# If one op's input is another op's output or # If one op's input is another op's output or
# one op's output is another op's input, we say # one op's output is another op's input, we say
# the two operator is connected. # the two operator is connected.
op1_input_names = op1.desc.input_arg_names() def _append_inname_remove_beta(varname_list):
op_input_names = []
for in_name in varname_list:
# HACK: remove beta1 and beta2 to avoid let all
# ops connected.
if in_name.startswith("beta2_pow_acc") or \
in_name.startswith("beta1_pow_acc"):
continue
else:
op_input_names.append(in_name)
return op_input_names
op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names())
op1_output_names = op1.desc.output_arg_names() op1_output_names = op1.desc.output_arg_names()
op2_input_names = op2.desc.input_arg_names() op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names())
op2_output_names = op2.desc.output_arg_names() op2_output_names = op2.desc.output_arg_names()
if set(op1_output_names) & set(op2_input_names) or \ if set(op1_output_names) & set(op2_input_names) or \
...@@ -694,7 +753,7 @@ class DistributeTranspiler: ...@@ -694,7 +753,7 @@ class DistributeTranspiler:
param_names = [ param_names = [
p.name for p in self.param_grad_ep_mapping[endpoint]["params"] p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
] ]
if op.input("Param") in param_names: if op.input("Param")[0] in param_names:
return True return True
else: else:
for n in param_names: for n in param_names:
......
...@@ -235,6 +235,77 @@ class Executor(object): ...@@ -235,6 +235,77 @@ class Executor(object):
tensor.set_lod(lod) tensor.set_lod(lod)
return tensor return tensor
def _get_program_cache(self, program_cache_key):
return self.program_caches.get(program_cache_key, None)
def _add_program_cache(self, program_cache_key, program):
self.program_caches[program_cache_key] = program
def _add_feed_fetch_ops(self, program, feed, fetch_list, feed_var_name,
fetch_var_name):
tmp_program = program.clone()
global_block = tmp_program.global_block()
if feed_var_name in global_block.vars:
feed_var = global_block.var(feed_var_name)
else:
feed_var = global_block.create_var(
name=feed_var_name,
type=core.VarDesc.VarType.FEED_MINIBATCH,
persistable=True)
if fetch_var_name in global_block.vars:
fetch_var = global_block.var(fetch_var_name)
else:
fetch_var = global_block.create_var(
name=fetch_var_name,
type=core.VarDesc.VarType.FETCH_LIST,
persistable=True)
# prepend feed operators
if not has_feed_operators(global_block, feed, feed_var_name):
for i, name in enumerate(feed):
out = global_block.var(name)
global_block.prepend_op(
type='feed',
inputs={'X': [feed_var]},
outputs={'Out': [out]},
attrs={'col': i})
# append fetch_operators
if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
for i, var in enumerate(fetch_list):
assert isinstance(var, Variable) or isinstance(var, str), (
"Wrong type for fetch_list[%s]: %s" % (i, type(var)))
global_block.append_op(
type='fetch',
inputs={'X': [var]},
outputs={'Out': [fetch_var]},
attrs={'col': i})
return tmp_program
def _feed_data(self, program, feed, feed_var_name, scope):
# feed var to framework
for op in program.global_block().ops:
if op.desc.type() == 'feed':
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = self.aslodtensor(cur_feed)
idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
break
def _fetch_data(self, fetch_list, fetch_var_name, scope):
outs = [
core.get_fetch_variable(scope, fetch_var_name, i)
for i in xrange(len(fetch_list))
]
return outs
def run(self, def run(self,
program=None, program=None,
feed=None, feed=None,
...@@ -268,7 +339,6 @@ class Executor(object): ...@@ -268,7 +339,6 @@ class Executor(object):
raise TypeError("feed should be a map") raise TypeError("feed should be a map")
if fetch_list is None: if fetch_list is None:
fetch_list = [] fetch_list = []
if program is None: if program is None:
program = default_main_program() program = default_main_program()
...@@ -278,79 +348,30 @@ class Executor(object): ...@@ -278,79 +348,30 @@ class Executor(object):
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
program_cache = None cache_key = get_program_cache_key(feed, fetch_list)
program_cache_key = get_program_cache_key(feed, fetch_list)
if use_program_cache: if use_program_cache:
# find program cache by cache_key cached_program = self._get_program_cache(cache_key)
program_cache = self.program_caches.get(program_cache_key, None) if cached_program is None:
# TODO(qiao): Should check program_cache and program are exactly the same. cached_program = self._add_feed_fetch_ops(
program=program,
feed=feed,
fetch_list=fetch_list,
feed_var_name=feed_var_name,
fetch_var_name=fetch_var_name)
self._add_program_cache(cache_key, cached_program)
program = cached_program
else: else:
self.program_caches.pop(program_cache_key, None) self.program_caches.pop(cache_key, None)
program = self._add_feed_fetch_ops(
if program_cache is None: program=program,
program_cache = program.clone() feed=feed,
fetch_list=fetch_list,
if use_program_cache: feed_var_name=feed_var_name,
self.program_caches[program_cache_key] = program_cache fetch_var_name=fetch_var_name)
global_block = program_cache.global_block() self._feed_data(program, feed, feed_var_name, scope)
self.executor.run(program.desc, scope, 0, True, True)
if feed_var_name in global_block.vars: outs = self._fetch_data(fetch_list, fetch_var_name, scope)
feed_var = global_block.var(feed_var_name)
else:
feed_var = global_block.create_var(
name=feed_var_name,
type=core.VarDesc.VarType.FEED_MINIBATCH,
persistable=True)
if fetch_var_name in global_block.vars:
fetch_var = global_block.var(fetch_var_name)
else:
fetch_var = global_block.create_var(
name=fetch_var_name,
type=core.VarDesc.VarType.FETCH_LIST,
persistable=True)
# prepend feed operators
if not has_feed_operators(global_block, feed, feed_var_name):
for i, name in enumerate(feed):
out = global_block.var(name)
global_block.prepend_op(
type='feed',
inputs={'X': [feed_var]},
outputs={'Out': [out]},
attrs={'col': i})
# append fetch_operators
if not has_fetch_operators(global_block, fetch_list,
fetch_var_name):
for i, var in enumerate(fetch_list):
assert isinstance(var, Variable) or isinstance(var, str), (
"Wrong type for fetch_list[%s]: %s" % (i, type(var)))
global_block.append_op(
type='fetch',
inputs={'X': [var]},
outputs={'Out': [fetch_var]},
attrs={'col': i})
# feed var to framework
for op in program_cache.global_block().ops:
if op.desc.type() == 'feed':
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = self.aslodtensor(cur_feed)
idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
break
self.executor.run(program_cache.desc, scope, 0, True, True)
outs = [
core.get_fetch_variable(scope, fetch_var_name, i)
for i in xrange(len(fetch_list))
]
if return_numpy: if return_numpy:
outs = as_numpy(outs) outs = as_numpy(outs)
return outs return outs
...@@ -918,6 +918,24 @@ class Block(object): ...@@ -918,6 +918,24 @@ class Block(object):
name=v.name) name=v.name)
self.vars[new_p.name] = new_p self.vars[new_p.name] = new_p
def clone_variable(self, var):
"""
Clone a variable into current block.
Args:
var: the variable to be cloned.
Returns:
The new variable cloned from 'var' in current block.
"""
assert isinstance(var, Variable)
return self.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=True)
class Program(object): class Program(object):
def __init__(self): def __init__(self):
...@@ -960,14 +978,14 @@ class Program(object): ...@@ -960,14 +978,14 @@ class Program(object):
"""Clone the Program object """Clone the Program object
Set for_test to False when we want to clone the program for training. Set for_test to False when we want to clone the program for training.
Set for_test to True when we want to clone the program for testing. Set for_test to True when we want to clone the program for testing.
Args: Args:
for_test(bool): Some operators, such as batch_norm and drop_out ops, for_test(bool): Some operators, such as batch_norm and drop_out ops,
behave differently in training and testing. If for_test is True, behave differently in training and testing. If for_test is True,
the is_test attributes in these operators will be set to True for the is_test attributes in these operators will be set to True for
testing purposes, otherwise, they remain unchanged. testing purposes, otherwise, they remain unchanged.
Returns(Program): Returns(Program):
The cloned Program object. The cloned Program object.
""" """
......
...@@ -399,7 +399,12 @@ class LayerHelper(object): ...@@ -399,7 +399,12 @@ class LayerHelper(object):
if isinstance(act, basestring): if isinstance(act, basestring):
act = {'type': act} act = {'type': act}
tmp = self.create_tmp_variable(dtype=input_var.dtype) tmp = self.create_tmp_variable(dtype=input_var.dtype)
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
act_type = act.pop('type') act_type = act.pop('type')
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
self.append_op( self.append_op(
type=act_type, type=act_type,
inputs={"X": [input_var]}, inputs={"X": [input_var]},
......
...@@ -16,7 +16,7 @@ import contextlib ...@@ -16,7 +16,7 @@ import contextlib
from layer_function_generator import autodoc from layer_function_generator import autodoc
from tensor import assign, fill_constant from tensor import assign, fill_constant
from .. import core from .. import core
from ..framework import Program, Variable, Operator, Block from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name from ..layer_helper import LayerHelper, unique_name
from ops import logical_and, logical_not, logical_or from ops import logical_and, logical_not, logical_or
...@@ -29,7 +29,6 @@ __all__ = [ ...@@ -29,7 +29,6 @@ __all__ = [
'WhileGuard', 'WhileGuard',
'While', 'While',
'Switch', 'Switch',
'Select',
'lod_rank_table', 'lod_rank_table',
'max_sequence_len', 'max_sequence_len',
'topk', 'topk',
...@@ -1212,186 +1211,6 @@ class Switch(object): ...@@ -1212,186 +1211,6 @@ class Switch(object):
return True return True
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None):
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else (self.DEFAULT)
self.value = value
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value):
"""Create a new block for this condition.
"""
select_case = SelectCase(
len(self.cases), self.case_to_execute, channel_action_fn, channel,
value)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
# TODO(varunarora): Figure out if defining output is needed.
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
class IfElseBlockGuard(object): class IfElseBlockGuard(object):
def __init__(self, is_true, ifelse): def __init__(self, is_true, ifelse):
if not isinstance(ifelse, IfElse): if not isinstance(ifelse, IfElse):
......
...@@ -129,13 +129,12 @@ def detection_output(loc, ...@@ -129,13 +129,12 @@ def detection_output(loc,
prior_box_var=prior_box_var, prior_box_var=prior_box_var,
target_box=loc, target_box=loc,
code_type='decode_center_size') code_type='decode_center_size')
old_shape = scores.shape old_shape = scores.shape
scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
scores = nn.softmax(input=scores) scores = nn.softmax(input=scores)
scores = ops.reshape(x=scores, shape=old_shape) scores = ops.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1]) scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op( helper.append_op(
type="multiclass_nms", type="multiclass_nms",
...@@ -150,6 +149,7 @@ def detection_output(loc, ...@@ -150,6 +149,7 @@ def detection_output(loc,
'score_threshold': score_threshold, 'score_threshold': score_threshold,
'nms_eta': 1.0 'nms_eta': 1.0
}) })
nmsed_outs.stop_gradient = True
return nmsed_outs return nmsed_outs
...@@ -475,6 +475,7 @@ def ssd_loss(location, ...@@ -475,6 +475,7 @@ def ssd_loss(location,
# 2. Compute confidence for mining hard examples # 2. Compute confidence for mining hard examples
# 2.1. Get the target label based on matched indices # 2.1. Get the target label based on matched indices
gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, )) gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
gt_label.stop_gradient = True
target_label, _ = target_assign( target_label, _ = target_assign(
gt_label, matched_indices, mismatch_value=background_label) gt_label, matched_indices, mismatch_value=background_label)
# 2.2. Compute confidence loss. # 2.2. Compute confidence loss.
...@@ -482,10 +483,12 @@ def ssd_loss(location, ...@@ -482,10 +483,12 @@ def ssd_loss(location,
confidence = __reshape_to_2d(confidence) confidence = __reshape_to_2d(confidence)
target_label = tensor.cast(x=target_label, dtype='int64') target_label = tensor.cast(x=target_label, dtype='int64')
target_label = __reshape_to_2d(target_label) target_label = __reshape_to_2d(target_label)
target_label.stop_gradient = True
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples # 3. Mining hard examples
conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior)) conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
conf_loss.stop_gradient = True
neg_indices = helper.create_tmp_variable(dtype='int32') neg_indices = helper.create_tmp_variable(dtype='int32')
dtype = matched_indices.dtype dtype = matched_indices.dtype
updated_matched_indices = helper.create_tmp_variable(dtype=dtype) updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
...@@ -695,6 +698,8 @@ def multi_box_head(inputs, ...@@ -695,6 +698,8 @@ def multi_box_head(inputs,
outputs={"Boxes": box, outputs={"Boxes": box,
"Variances": var}, "Variances": var},
attrs=attrs, ) attrs=attrs, )
box.stop_gradient = True
var.stop_gradient = True
return box, var return box, var
def _reshape_with_axis_(input, axis=1): def _reshape_with_axis_(input, axis=1):
...@@ -834,4 +839,6 @@ def multi_box_head(inputs, ...@@ -834,4 +839,6 @@ def multi_box_head(inputs,
mbox_locs_concat = tensor.concat(mbox_locs, axis=1) mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
mbox_confs_concat = tensor.concat(mbox_confs, axis=1) mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
box.stop_gradient = True
var.stop_gradient = True
return mbox_locs_concat, mbox_confs_concat, box, var return mbox_locs_concat, mbox_confs_concat, box, var
...@@ -21,7 +21,8 @@ from ..executor import global_scope ...@@ -21,7 +21,8 @@ from ..executor import global_scope
__all__ = [ __all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'read_file', 'create_shuffle_reader', 'create_double_buffer_reader' 'open_files', 'read_file', 'create_shuffle_reader',
'create_double_buffer_reader', 'create_multi_pass_reader'
] ]
...@@ -112,9 +113,9 @@ class ListenAndServ(object): ...@@ -112,9 +113,9 @@ class ListenAndServ(object):
which can receive variables from clients and run a block. which can receive variables from clients and run a block.
""" """
def __init__(self, endpoint, fan_in=1, optimizer_mode=True): def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True):
self.helper = LayerHelper("listen_and_serv") self.helper = LayerHelper("listen_and_serv")
self.inputs = [] self.inputs = inputs
self.outputs = [] self.outputs = []
self.endpoint = endpoint self.endpoint = endpoint
self.fan_in = fan_in self.fan_in = fan_in
...@@ -159,18 +160,13 @@ class ListenAndServ(object): ...@@ -159,18 +160,13 @@ class ListenAndServ(object):
current_block = main_program.current_block() current_block = main_program.current_block()
parent_block = self.parent_block() parent_block = self.parent_block()
params, grads = self.get_params_and_grads()
param_names = [p.name for p in params]
grad_names = [g.name for g in grads]
parent_block.append_op( parent_block.append_op(
type='listen_and_serv', type='listen_and_serv',
inputs={}, inputs={"X": self.inputs},
outputs={}, outputs={},
attrs={ attrs={
'endpoint': self.endpoint, 'endpoint': self.endpoint,
'Fanin': self.fan_in, 'Fanin': self.fan_in,
'ParamList': param_names,
'GradList': grad_names,
'OptimizeBlock': current_block 'OptimizeBlock': current_block
}) })
...@@ -195,10 +191,14 @@ def Send(endpoints, send_vars, get_vars): ...@@ -195,10 +191,14 @@ def Send(endpoints, send_vars, get_vars):
endpoints = list(set(epmap)) endpoints = list(set(epmap))
helper = LayerHelper("Send", **locals()) helper = LayerHelper("Send", **locals())
rpc_client_var = default_main_program().global_block().create_var(
name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
helper.append_op( helper.append_op(
type="send", type="send",
inputs={"X": send_vars}, inputs={"X": send_vars},
outputs={"Out": get_vars}, outputs={"Out": get_vars,
"RPCClient": rpc_client_var},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
...@@ -287,6 +287,36 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): ...@@ -287,6 +287,36 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
startup_var) startup_var)
def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = []
ranks = []
for shape in shapes:
shape_concat.extend(shape)
ranks.append(len(shape))
var_name = unique_name('multiple_reader')
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op(
type='open_files',
outputs={'Out': [startup_var]},
attrs={
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'ranks': ranks,
'file_names': filenames,
'thread_num': thread_num
})
startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(),
startup_var)
def __create_decorated_reader__(op_type, reader, attrs): def __create_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type) var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
...@@ -314,6 +344,11 @@ def create_double_buffer_reader(reader, place=None): ...@@ -314,6 +344,11 @@ def create_double_buffer_reader(reader, place=None):
attrs) attrs)
def create_multi_pass_reader(reader, pass_num):
return __create_decorated_reader__('create_multi_pass_reader', reader,
{'pass_num': int(pass_num)})
def read_file(file_obj): def read_file(file_obj):
helper = LayerHelper('read_file') helper = LayerHelper('read_file')
out = [ out = [
......
...@@ -74,6 +74,7 @@ __all__ = [ ...@@ -74,6 +74,7 @@ __all__ = [
'one_hot', 'one_hot',
'autoincreased_step_counter', 'autoincreased_step_counter',
'lod_reset', 'lod_reset',
'lrn',
] ]
...@@ -82,6 +83,7 @@ def fc(input, ...@@ -82,6 +83,7 @@ def fc(input,
num_flatten_dims=1, num_flatten_dims=1,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_mkldnn=False,
act=None, act=None,
name=None): name=None):
""" """
...@@ -163,8 +165,11 @@ def fc(input, ...@@ -163,8 +165,11 @@ def fc(input,
inputs={"X": input_var, inputs={"X": input_var,
"Y": w}, "Y": w},
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={"x_num_col_dims": num_flatten_dims, attrs={
"y_num_col_dims": 1}) "x_num_col_dims": num_flatten_dims,
"y_num_col_dims": 1,
'use_mkldnn': use_mkldnn
})
mul_results.append(tmp) mul_results.append(tmp)
# sum # sum
...@@ -1117,12 +1122,14 @@ def conv2d(input, ...@@ -1117,12 +1122,14 @@ def conv2d(input,
filter_size, filter_size,
stride=1, stride=1,
padding=0, padding=0,
dilation=1,
groups=None, groups=None,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, use_cudnn=True,
use_mkldnn=False, use_mkldnn=False,
act=None): act=None,
name=None):
""" """
**Convlution2D Layer** **Convlution2D Layer**
...@@ -1183,6 +1190,9 @@ def conv2d(input, ...@@ -1183,6 +1190,9 @@ def conv2d(input,
padding(int|tuple): The padding size. If padding is a tuple, it must padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding. Default: padding = 0. padding_H = padding_W = padding. Default: padding = 0.
dilation(int|tuple): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: dilation = 1.
groups(int): The groups number of the Conv2d Layer. According to grouped groups(int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2, convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half the first half of the filters is only connected to the first half
...@@ -1193,6 +1203,8 @@ def conv2d(input, ...@@ -1193,6 +1203,8 @@ def conv2d(input,
use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
act(str): Activation type. Default: None act(str): Activation type. Default: None
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns: Returns:
Variable: The tensor variable storing the convolution and \ Variable: The tensor variable storing the convolution and \
...@@ -1233,6 +1245,7 @@ def conv2d(input, ...@@ -1233,6 +1245,7 @@ def conv2d(input,
filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
stride = utils.convert_to_list(stride, 2, 'stride') stride = utils.convert_to_list(stride, 2, 'stride')
padding = utils.convert_to_list(padding, 2, 'padding') padding = utils.convert_to_list(padding, 2, 'padding')
dilation = utils.convert_to_list(dilation, 2, 'dilation')
if not isinstance(use_cudnn, bool): if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False") raise ValueError("use_cudnn should be True or False")
...@@ -1262,6 +1275,7 @@ def conv2d(input, ...@@ -1262,6 +1275,7 @@ def conv2d(input,
attrs={ attrs={
'strides': stride, 'strides': stride,
'paddings': padding, 'paddings': padding,
'dilations': dilation,
'groups': groups, 'groups': groups,
'use_cudnn': use_cudnn, 'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn 'use_mkldnn': use_mkldnn
...@@ -1670,7 +1684,9 @@ def conv2d_transpose(input, ...@@ -1670,7 +1684,9 @@ def conv2d_transpose(input,
stride=1, stride=1,
dilation=1, dilation=1,
param_attr=None, param_attr=None,
bias_attr=None,
use_cudnn=True, use_cudnn=True,
act=None,
name=None): name=None):
""" """
**Convlution2D transpose layer** **Convlution2D transpose layer**
...@@ -1739,8 +1755,10 @@ def conv2d_transpose(input, ...@@ -1739,8 +1755,10 @@ def conv2d_transpose(input,
dilation_H = dilation_W = dilation. Default: dilation = 1. dilation_H = dilation_W = dilation. Default: dilation = 1.
param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
Default: None Default: None
bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
act(str): Activation type. Default: None
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -1793,12 +1811,12 @@ def conv2d_transpose(input, ...@@ -1793,12 +1811,12 @@ def conv2d_transpose(input,
img_filter = helper.create_parameter( img_filter = helper.create_parameter(
dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
out = helper.create_tmp_variable(dtype=input.dtype) pre_bias = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type='conv2d_transpose', type='conv2d_transpose',
inputs={'Input': [input], inputs={'Input': [input],
'Filter': [img_filter]}, 'Filter': [img_filter]},
outputs={'Output': out}, outputs={'Output': pre_bias},
attrs={ attrs={
'strides': stride, 'strides': stride,
'paddings': padding, 'paddings': padding,
...@@ -1806,55 +1824,57 @@ def conv2d_transpose(input, ...@@ -1806,55 +1824,57 @@ def conv2d_transpose(input,
'use_cudnn': use_cudnn 'use_cudnn': use_cudnn
}) })
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
out = helper.append_activation(pre_act)
return out return out
def sequence_expand(x, y, name=None): def sequence_expand(x, y, ref_level=-1, name=None):
"""Sequence Expand Layer. This layer will expand the input variable **x** """Sequence Expand Layer. This layer will expand the input variable **x**
according to LoD information of **y**. And the following examples will according to specified level lod of **y**. Please note that lod level of
explain how sequence_expand works: **x** is at most 1 and rank of **x** is at least 2. When rank of **x**
is greater than 2, then it would be viewed as a 2-D tensor.
Following examples will explain how sequence_expand works:
.. code-block:: text .. code-block:: text
* Case 1 * Case 1
x is a LoDTensor: x is a LoDTensor:
x.lod = [[0, 2, 3], x.lod = [[0, 2, 4]]
[0, 1, 3, 4]] x.data = [[a], [b], [c], [d]]
x.data = [a, b, c, d]
x.dims = [4, 1] x.dims = [4, 1]
y is a LoDTensor: y is a LoDTensor:
y.lod = [[0, 2, 4], y.lod = [[0, 2, 4],
[0, 3, 6, 7, 8]] [0, 3, 6, 7, 8]]
with condition len(y.lod[-1]) - 1 == x.dims[0] ref_level: 0
then output is a 2-level LoDTensor: then output is a 1-level LoDTensor:
out.lod = [[0, 2, 4], out.lod = [[0, 2, 4, 6, 8]]
[0, 3, 6, 7, 8]] out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
out.data = [a, a, a, b, b, b, c, d]
out.dims = [8, 1] out.dims = [8, 1]
* Case 2 * Case 2
x is a Tensor: x is a Tensor:
x.data = [a, b, c] x.data = [[a], [b], [c]]
x.dims = [3, 1] x.dims = [3, 1]
y is a LoDTensor: y is a LoDTensor:
y.lod = [[0, 2, 3, 6]] y.lod = [[0, 2, 2, 5]]
with condition len(y.lod[-1]) - 1 == x.dims[0]
then output is a 1-level LoDTensor: ref_level: -1
out.lod = [[0, 2, 3, 6]]
out.data = [a, a, b, c, c, c]
out.dims = [6, 1]
then output is a Tensor:
out.data = [[a], [a], [c], [c], [c]]
out.dims = [5, 1]
Args: Args:
x (Variable): The input variable which is a Tensor or LoDTensor. x (Variable): The input variable which is a Tensor or LoDTensor.
y (Variable): The input variable which is a LoDTensor. y (Variable): The input variable which is a LoDTensor.
ref_level (int): Lod level of `y` to be referred by `x`. If set to -1,
refer the last level of lod.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
Variable: The expanded variable which is a LoDTensor. Variable: The expanded variable which is a LoDTensor.
...@@ -1865,14 +1885,17 @@ def sequence_expand(x, y, name=None): ...@@ -1865,14 +1885,17 @@ def sequence_expand(x, y, name=None):
x = fluid.layers.data(name='x', shape=[10], dtype='float32') x = fluid.layers.data(name='x', shape=[10], dtype='float32')
y = fluid.layers.data(name='y', shape=[10, 20], y = fluid.layers.data(name='y', shape=[10, 20],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
out = layers.sequence_expand(x=x, y=y) out = layers.sequence_expand(x=x, y=y, ref_level=0)
""" """
helper = LayerHelper('sequence_expand', input=x, **locals()) helper = LayerHelper('sequence_expand', input=x, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type='sequence_expand', inputs={'X': x, type='sequence_expand',
'Y': y}, outputs={'Out': tmp}) inputs={'X': x,
'Y': y},
outputs={'Out': tmp},
attrs={'ref_level': ref_level})
return tmp return tmp
...@@ -3388,3 +3411,73 @@ def lod_reset(x, y=None, target_lod=None): ...@@ -3388,3 +3411,73 @@ def lod_reset(x, y=None, target_lod=None):
raise ValueError("y and target_lod should not be both None.") raise ValueError("y and target_lod should not be both None.")
return out return out
def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
"""
Local Response Normalization Layer. This layer performs a type of
"lateral inhibition" by normalizing over local input regions.
The formula is as follows:
.. math::
Output(i, x, y) = Input(i, x, y) / \left(
k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)}
(Input(j, x, y))^2 \right)^{\beta}
In the above equation:
* :math:`n`: The number of channels to sum over.
* :math:`k`: The offset (avoid being divided by 0).
* :math:`alpha`: The scaling parameter.
* :math:`beta`: The exponent parameter.
Refer to `ImageNet Classification with Deep Convolutional Neural Networks
<https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
Args:
input (Variable): The input tensor of this layer, and the dimension of input tensor must be 4.
n (int, default 5): The number of channels to sum over.
k (float, default 1.0): An offset (usually positive to avoid dividing by 0).
alpha (float, default 1e-4): The scaling parameter.
beta (float, default 0.75): The exponent.
name (str, default None): A name for this operation.
Raises:
ValueError: If rank of the input tensor is not 4.
Returns:
A tensor variable storing the transformation result.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")
lrn = fluid.layers.lrn(input=data)
"""
helper = LayerHelper('lrn', **locals())
dtype = helper.input_dtype()
input_shape = input.shape
dims = len(input_shape)
if dims != 4:
raise ValueError(
"dims of input must be 4(not %d), and it's order must be NCHW" %
(dims))
mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
lrn_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="lrn",
inputs={"X": input},
outputs={
"Out": lrn_out,
"MidOut": mid_out,
},
attrs={"n": n,
"k": k,
"alpha": alpha,
"beta": beta})
return lrn_out
...@@ -69,6 +69,7 @@ __all__ = [ ...@@ -69,6 +69,7 @@ __all__ = [
'gaussian_random_batch_size_like', 'gaussian_random_batch_size_like',
'cumsum', 'cumsum',
'scatter', 'scatter',
'sum',
] + __activations__ ] + __activations__
for _OP in set(__all__): for _OP in set(__all__):
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from collections import defaultdict from collections import defaultdict
from paddle.fluid.framework import Program
import framework import framework
import layers import layers
from backward import append_backward from backward import append_backward
...@@ -23,8 +23,12 @@ from initializer import Constant ...@@ -23,8 +23,12 @@ from initializer import Constant
from layer_helper import LayerHelper from layer_helper import LayerHelper
from regularizer import append_regularization_ops from regularizer import append_regularization_ops
from clip import append_gradient_clip_ops, error_clip_callback from clip import append_gradient_clip_ops, error_clip_callback
from contextlib import contextmanager
__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad'] __all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
'Adadelta', 'ModelAverage'
]
class Optimizer(object): class Optimizer(object):
...@@ -119,7 +123,12 @@ class Optimizer(object): ...@@ -119,7 +123,12 @@ class Optimizer(object):
""" """
pass pass
def _add_accumulator(self, name, param, dtype=None, fill_value=0.0): def _add_accumulator(self,
name,
param,
dtype=None,
fill_value=0.0,
shape=None):
"""Utility function to add an accumulator for a parameter """Utility function to add an accumulator for a parameter
Args: Args:
...@@ -133,17 +142,19 @@ class Optimizer(object): ...@@ -133,17 +142,19 @@ class Optimizer(object):
param.name in self._accumulators[name]): param.name in self._accumulators[name]):
raise Exception("Accumulator {} already exists for parameter {}". raise Exception("Accumulator {} already exists for parameter {}".
format(name, param.name)) format(name, param.name))
if shape == None:
shape = param.shape
assert isinstance(self.helper, LayerHelper) assert isinstance(self.helper, LayerHelper)
var = self.helper.create_global_variable( var = self.helper.create_global_variable(
name=unique_name.generate(name), name=unique_name.generate(name),
persistable=True, persistable=True,
dtype=dtype or param.dtype, dtype=dtype or param.dtype,
type=param.type, type=param.type,
shape=param.shape) shape=shape)
self.helper.set_variable_initializer( self.helper.set_variable_initializer(
var, initializer=Constant(value=float(fill_value))) var, initializer=Constant(value=float(fill_value)))
self._accumulators[name][param.name] = var self._accumulators[name][param.name] = var
return var
def _get_accumulator(self, name, param): def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter """Utility function to fetch an accumulator for a parameter
...@@ -580,6 +591,205 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -580,6 +591,205 @@ class DecayedAdagradOptimizer(Optimizer):
return decayed_adagrad_op return decayed_adagrad_op
class AdadeltaOptimizer(Optimizer):
"""
**Adadelta Optimizer**
Simple Adadelta optimizer with average squared grad state and
average squared update state.
The details of adadelta please refer to this
`ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
<http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
.. math::
E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
E(g_t^2) + \\epsilon ) ) \\\\
E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
Args:
learning_rate(float): global leraning rate
rho(float): rho in equation
epsilon(float): epsilon in equation
Examples:
.. code-block:: python
optimizer = fluid.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
_, params_grads = optimizer.minimize(cost)
"""
_avg_squared_grad_acc_str = "_avg_squared_grad"
_avg_squared_update_acc_str = "_avg_squared_update"
def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
if learning_rate is None:
raise ValueError("learning_rate is not set.")
if epsilon is None:
raise ValueError("epsilon is not set.")
if rho is None:
raise ValueError("rho is not set.")
super(AdadeltaOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "adadelta"
self._epsilon = epsilon
self._rho = rho
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
for p in parameters:
self._add_accumulator(self._avg_squared_grad_acc_str, p)
self._add_accumulator(self._avg_squared_update_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
avg_squared_grad_acc = self._get_accumulator(
self._avg_squared_grad_acc_str, param_and_grad[0])
avg_squared_update_acc = self._get_accumulator(
self._avg_squared_update_acc_str, param_and_grad[0])
# Create the adadelta optimizer op
adadelta_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"AvgSquaredGrad": avg_squared_grad_acc,
"AvgSquaredUpdate": avg_squared_update_acc
},
outputs={
"ParamOut": param_and_grad[0],
"AvgSquaredGradOut": avg_squared_grad_acc,
"AvgSquaredUpdateOut": avg_squared_update_acc
},
attrs={"epsilon": self._epsilon,
"rho": self._rho})
return adadelta_op
class RMSPropOptimizer(Optimizer):
"""
Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
rate method. The original slides proposed RMSProp: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
The original equation is as follows:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
The first equation calculates moving average of the squared gradient for
each weight. Then dividing the gradient by :math: `sqrt{v(w,t)}`.
In some cases, adding a momentum term :math: `\\beta` is beneficial.
In our implementation, Nesterov momentum is used:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
where, :math: `\\rho` is a hyperparameter and typical values are 0.9, 0.95
and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
smoothing term to avoid division by zero, usually set somewhere in range
from 1e-4 to 1e-8.
Args:
learning_rate(float): global leraning rate.
rho(float): rho is :math: `\\rho` in equation, set 0.95 by default.
epsilon(float): :math: `\\epsilon` in equation is smoothing term to
avoid division by zero, set 1e-6 by default.
momentum(float): :math: `\\beta` in equation is the momentum term,
set 0.0 by default.
Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.RMSProp(0.0001)
_, params_grads = optimizer.minimize(cost)
"""
_momentum_acc_str = "momentum"
_mean_square_acc_str = "mean_square"
def __init__(self,
learning_rate,
rho=0.95,
epsilon=1.0e-6,
momentum=0.0,
**kwargs):
super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
if learning_rate is None:
raise ValueError("learning_rate is not set.")
if rho is None:
raise ValueError("rho is not set.")
if epsilon is None:
raise ValueError("epsilon is not set.")
if momentum is None:
raise ValueError("momentum is not set.")
self.type = "rmsprop"
self._rho = rho
self._epsilon = epsilon
self._momentum = momentum
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
for p in parameters:
self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
momentum_acc = self._get_accumulator(self._momentum_acc_str,
param_and_grad[0])
mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
param_and_grad[0])
rmsprop_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"LearningRate": self._create_param_lr(param_and_grad),
},
outputs={
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc
},
attrs={
"epsilon": self._epsilon,
"decay": self._rho,
"momentum": self._momentum
})
return rmsprop_op
# We short the class name, since users will use the optimizer with the package # We short the class name, since users will use the optimizer with the package
# name. The sample code: # name. The sample code:
# #
...@@ -594,3 +804,145 @@ Adagrad = AdagradOptimizer ...@@ -594,3 +804,145 @@ Adagrad = AdagradOptimizer
Adam = AdamOptimizer Adam = AdamOptimizer
Adamax = AdamaxOptimizer Adamax = AdamaxOptimizer
DecayedAdagrad = DecayedAdagradOptimizer DecayedAdagrad = DecayedAdagradOptimizer
Adadelta = AdadeltaOptimizer
RMSProp = RMSPropOptimizer
class ModelAverage(Optimizer):
"""Accumulate the average of parameters whtin sliding window. The average
result will be saved in temporary variables which can be applied to
parameter variables of current model by calling 'apply()' method. And the
'restore()' method is used to restored the parameter values of current model.
The size of average window is determined by average_window_rate,
min_average_window, max_average_window and current update times.
Args:
params_grads: A list of parameter-grad variable pairs.
average_window_rate: The rate of average window.
min_average_window: The minimum size of average window.
max_average_window: The maximum size of average window.
Examples:
...
optimizer = fluid.optimizer.Momentum()
_, params_grads = optimizer.minimize(cost)
model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
min_average_window=10000,
max_average_window=20000)
for pass_id in range(args.pass_num):
for data in train_reader():
exe.run(fluid.default_main_program()...)
with model_average.apply(exe):
for data in test_reader():
exe.run(inference_program...)
"""
def __init__(self,
params_grads,
average_window_rate,
min_average_window=10000,
max_average_window=10000,
**kwargs):
super(ModelAverage, self).__init__(0.0, **kwargs)
self.average_window = average_window_rate
self.min_average_window = min_average_window
self.max_average_window = max_average_window
self.params_grads = params_grads
for param, grad in self.params_grads:
if grad is not None:
self._append_average_accumulate_op(param)
self.apply_program = Program()
block = self.apply_program.global_block()
with program_guard(main_program=self.apply_program):
for param_grad in self.params_grads:
if param_grad[1] is not None:
self._add_average_apply_op(block, param_grad)
self.restore_program = Program()
block = self.restore_program.global_block()
with program_guard(main_program=self.restore_program):
for param_grad in self.params_grads:
if param_grad[1] is not None:
self._add_average_restore_op(block, param_grad)
def _add_average_apply_op(self, block, param_grad):
param = block.clone_variable(param_grad[0])
grad = block.clone_variable(param_grad[1])
sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
num_accumulates = block.clone_variable(
self._get_accumulator('num_accumulates', param))
old_num_accumulates = block.clone_variable(
self._get_accumulator('old_num_accumulates', param))
num_updates = block.clone_variable(
self._get_accumulator('num_updates', param))
# backup param value to grad
layers.assign(input=param, output=grad)
# param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
sum = layers.sum(x=[sum_1, sum_2, sum_3])
tmp = layers.cast(x=tmp, dtype='float32')
sum = layers.cast(x=sum, dtype='float32')
layers.elementwise_div(x=sum, y=tmp, out=param)
def _add_average_restore_op(self, block, param_grad):
param = block.clone_variable(param_grad[0])
grad = block.clone_variable(param_grad[1])
layers.assign(input=grad, output=param)
def _append_average_accumulate_op(self, param):
self.helper = LayerHelper("average_accumulate")
sum_1 = self._add_accumulator('sum_1', param)
sum_2 = self._add_accumulator('sum_2', param)
sum_3 = self._add_accumulator('sum_3', param)
num_accumulates = self._add_accumulator(
'num_accumulates', param, dtype='int64', shape=[1])
old_num_accumulates = self._add_accumulator(
'old_num_accumulates', param, dtype='int64', shape=[1])
num_updates = self._add_accumulator(
'num_updates', param, dtype='int64', shape=[1])
self.helper.append_op(
type='average_accumulates',
inputs={
"param": param,
"in_sum_1": sum_1,
"in_sum_2": sum_2,
"in_sum_3": sum_3,
"in_num_accumulates": num_accumulates,
"in_old_num_accumulates": old_num_accumulates,
"in_num_updates": num_updates
},
outputs={
"out_sum_1": sum_1,
"out_sum_2": sum_2,
"out_sum_3": sum_3,
"out_num_accumulates": num_accumulates,
"out_old_num_accumulates": old_num_accumulates,
"out_num_updates": num_updates,
},
attrs={
"average_window": self.average_window,
"min_average_window": self.min_average_window,
"max_average_window": self.max_average_window,
})
@contextmanager
def apply(self, executor, need_restore=True):
"""Apply average values to parameters of current model.
"""
executor.run(self.apply_program)
try:
yield
finally:
if need_restore:
self.restore(executor)
def restore(self, executor):
"""Restore parameter values of current model.
"""
executor.run(self.restore_program)
...@@ -118,12 +118,12 @@ def decoder_decode(context, is_sparse): ...@@ -118,12 +118,12 @@ def decoder_decode(context, is_sparse):
is_sparse=is_sparse) is_sparse=is_sparse)
# use rnn unit to update rnn # use rnn unit to update rnn
current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded], current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
size=decoder_size, size=decoder_size,
act='tanh') act='tanh')
current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
# use score to do beam search # use score to do beam search
current_score = pd.fc(input=current_state, current_score = pd.fc(input=current_state_with_lod,
size=target_dict_dim, size=target_dict_dim,
act='softmax') act='softmax')
topk_scores, topk_indices = pd.topk(current_score, k=50) topk_scores, topk_indices = pd.topk(current_score, k=50)
......
...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase): ...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase):
with while_op.block(): with while_op.block():
result2 = fill_constant( result2 = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
x_to_send_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
# TODO(abhinav): Need to perform copy when doing a channel send.
# Once this is complete, we can remove these lines
assign(input=x, output=x_to_send_tmp)
with fluid.Select() as select: with fluid.Select() as select:
with select.case(fluid.channel_send, channel, with select.case(
x_to_send_tmp): fluid.channel_send, channel, x, is_copy=True):
assign(input=x, output=x_tmp) assign(input=x, output=x_tmp)
assign(input=y, output=x) assign(input=y, output=x)
assign(elementwise_add(x=x_tmp, y=y), output=y) assign(elementwise_add(x=x_tmp, y=y), output=y)
...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase): ...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase):
core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64) core.VarDesc.VarType.FP64)
pong_result = self._create_tensor('pong_return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
def ping(ch, message): def ping(ch, message):
message_to_send_tmp = fill_constant( fluid.channel_send(ch, message, is_copy=True)
shape=[1], dtype=core.VarDesc.VarType.FP64, value=0)
assign(input=message, output=message_to_send_tmp)
fluid.channel_send(ch, message_to_send_tmp)
def pong(ch1, ch2): def pong(ch1, ch2):
fluid.channel_recv(ch1, ping_result) fluid.channel_recv(ch1, ping_result)
assign(input=ping_result, output=pong_result) fluid.channel_send(ch2, ping_result, is_copy=True)
fluid.channel_send(ch2, pong_result)
pings = fluid.make_channel( pings = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
......
mnist.recordio mnist.recordio
mnist_0.recordio
mnist_1.recordio
mnist_2.recordio
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
from scipy.special import expit from scipy.special import expit
...@@ -212,18 +213,39 @@ class TestRound(OpTest): ...@@ -212,18 +213,39 @@ class TestRound(OpTest):
class TestRelu(OpTest): class TestRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "relu" self.op_type = "relu"
x = np.random.uniform(-1, 1, [11, 17]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
# The same reason with TestAbs # The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02 x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x} out = np.maximum(x, 0)
self.outputs = {'Out': np.maximum(self.inputs['X'], 0)}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Relu(TestRelu):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestBRelu(OpTest): class TestBRelu(OpTest):
def setUp(self): def setUp(self):
...@@ -484,5 +506,54 @@ class TestSwish(OpTest): ...@@ -484,5 +506,54 @@ class TestSwish(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
#--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu):
def setUp(self):
super(TestMKLDNNRelu, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh):
def setUp(self):
super(TestMKLDNNTanh, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrt, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs):
def setUp(self):
super(TestMKLDNNAbs, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x}
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -31,6 +31,37 @@ def get_backward_op(scope, op, no_grad_set): ...@@ -31,6 +31,37 @@ def get_backward_op(scope, op, no_grad_set):
return backward_op return backward_op
def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
if data_format == "NCHW":
n, c, h, w = x.shape
mean_tile = np.reshape(mean, (1, c, 1, 1))
mean_tile = np.tile(mean_tile, (n, 1, h, w))
var_tile = np.reshape(var, (1, c, 1, 1))
var_tile = np.tile(var_tile, (n, 1, h, w))
normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon)
scale_tile = np.reshape(scale, (1, c, 1, 1))
scale_tile = np.tile(scale_tile, (n, 1, h, w))
offset_tile = np.reshape(offset, (1, c, 1, 1))
offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
y = normalized * scale_tile + offset_tile
elif data_format == "NHWC":
normalized = (x - mean) / np.sqrt(var + epsilon)
y = normalized * scale + offset
else:
raise ValueError("Unknown data order.")
if len(x_shape) == 2:
y = np.reshape(y, x_shape)
return y
def _reference_training(x, scale, offset, epsilon, data_format): def _reference_training(x, scale, offset, epsilon, data_format):
x_shape = x.shape x_shape = x.shape
if len(x_shape) == 2: if len(x_shape) == 2:
...@@ -155,11 +186,159 @@ def set_output_grad(scope, outputs, place, feed_dict=None): ...@@ -155,11 +186,159 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
__set_tensor__(output, data) __set_tensor__(output, data)
class TestBatchNormOp(OpTest): class TestBatchNormOpInference(OpTest):
def setUp(self):
self.dtype = np.float32
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def test_python(self): def check_with_place(self, place, data_layout, dtype, shape):
epsilon = 0.00001
if len(shape) == 2:
x_shape = shape
c = x_shape[1]
else:
n, h, w, c = shape[0], shape[1], shape[2], shape[3]
if data_layout == "NHWC":
x_shape = [n, h, w, c]
elif data_layout == "NCHW":
x_shape = [n, c, h, w]
else:
raise ValueError("Unknown data layout.")
scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(dtype)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
epsilon, data_layout).astype(dtype)
scope = core.Scope()
# create input
x_tensor = create_or_get_tensor(scope, "x_val",
OpTest.np_dtype_to_fluid_dtype(x_val),
place)
scale_tensor = create_or_get_tensor(
scope, "scale_val",
OpTest.np_dtype_to_fluid_dtype(scale_val), place)
bias_tensor = create_or_get_tensor(
scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place)
mean_tensor = create_or_get_tensor(scope, "mean",
OpTest.np_dtype_to_fluid_dtype(mean),
place)
variance_tensor = create_or_get_tensor(
scope, "variance", OpTest.np_dtype_to_fluid_dtype(variance), place)
# create output
y_tensor = create_or_get_tensor(scope, "y_out", None, place)
saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
place)
saved_variance_tensor = create_or_get_tensor(scope, "saved_variance",
None, place)
mean_out_tensor = mean_tensor
variance_out_tensor = variance_tensor
batch_norm_op = Operator(
"batch_norm",
# inputs
X="x_val",
Scale="scale_val",
Bias="bias_val",
Mean="mean",
Variance="variance",
# outputs
Y="y_out",
MeanOut="mean",
VarianceOut="variance",
SavedMean="saved_mean",
SavedVariance="saved_variance",
# attrs
is_test=True,
data_layout=data_layout,
epsilon=epsilon)
batch_norm_op.run(scope, place)
# check inference result
self.__assert_close(
y_tensor,
y_out,
"inference output are different at " + str(place) + ", " +
data_layout + ", " + str(np.dtype(dtype)) +
str(np.array(y_tensor)) + str(y_out),
atol=1e-3)
def test_check_output(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(core.CUDAPlace(0))
for place in places:
for data_format in ["NCHW", "NHWC"]:
self.check_with_place(place, data_format, self.dtype,
[2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3])
class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def setUp(self):
self.dtype = np.float16
def test_check_output(self):
places = []
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
places.append(place)
for place in places:
for data_format in ["NCHW", "NHWC"]:
self.check_with_place(place, data_format, self.dtype,
[2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3])
class TestBatchNormOpTraining(OpTest):
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def test_python_testing(self):
data_format = "NHWC"
epsilon = 0.00001
n, h, w, c = 2, 3, 4, 5
x_shape = [n, h, w, c]
scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
epsilon, "NHWC")
# running N, C, H, W case
# should produce the same results
x_shape2 = [n, c, h, w]
x_val2 = np.transpose(x_val, (0, 3, 1, 2))
y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance,
epsilon, "NCHW")
# transfer (N, C, H, W) back to (N, H, W, C)
y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
self.__assert_close(y_out, y_out2_trans, "inference output")
print 'python: NHWC, NCHW, inference checking passed'
def test_python_training(self):
data_format = "NHWC" data_format = "NHWC"
epsilon = 0.00001 epsilon = 0.00001
momentum = 0.9 momentum = 0.9
...@@ -197,7 +376,7 @@ class TestBatchNormOp(OpTest): ...@@ -197,7 +376,7 @@ class TestBatchNormOp(OpTest):
# transfer (N, C, H, W) back to (N, H, W, C) # transfer (N, C, H, W) back to (N, H, W, C)
y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1)) y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
self.__assert_close(y_out, y_out2_trans, "batch variance") self.__assert_close(y_out, y_out2_trans, "batch output")
print 'python: NHWC, NCHW, forward checking passed' print 'python: NHWC, NCHW, forward checking passed'
# test backward now # test backward now
......
...@@ -20,19 +20,35 @@ from op_test import OpTest ...@@ -20,19 +20,35 @@ from op_test import OpTest
class TestConcatOp(OpTest): class TestConcatOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "concat" self.op_type = "concat"
x0 = np.random.random((2, 1, 4, 5)).astype('float32') self.init_test_data()
x1 = np.random.random((2, 2, 4, 5)).astype('float32') self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
x2 = np.random.random((2, 3, 4, 5)).astype('float32') self.attrs = {'axis': self.axis}
axis = 1 self.outputs = {
self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]} 'Out': np.concatenate(
self.attrs = {'axis': axis} (self.x0, self.x1, self.x2), axis=self.axis)
self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)} }
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['x0'], 'Out') self.check_grad(['x0'], 'Out')
self.check_grad(['x1'], 'Out')
self.check_grad(['x2'], 'Out')
def init_test_data(self):
self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
class TestConcatOp2(OpTest):
def init_test_data(self):
self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
...@@ -82,5 +83,37 @@ class TestDropoutOp5(OpTest): ...@@ -82,5 +83,37 @@ class TestDropoutOp5(OpTest):
self.check_output() self.check_output()
class TestFP16DropoutOp(OpTest):
def setUp(self):
self.op_type = "dropout"
self.init_test_case()
x = np.random.random(self.input_size).astype("float16")
out = x * (1.0 - self.prob)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {
'dropout_prob': self.prob,
'fix_seed': self.fix_seed,
'is_test': True
}
self.outputs = {'Out': out}
def init_test_case(self):
self.input_size = [32, 64]
self.prob = 0.35
self.fix_seed = True
def test_check_output(self):
if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
class TestFP16DropoutOp2(TestFP16DropoutOp):
def init_test_case(self):
self.input_size = [32, 64, 3]
self.prob = 0.75
self.fix_seed = False
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -13,158 +13,243 @@ ...@@ -13,158 +13,243 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
class TestElementwiseOp(OpTest): class TestElementwiseAddOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "elementwise_add" self.op_type = "elementwise_add"
self.dtype = np.float32
self.axis = -1
self.init_dtype()
self.init_input_output()
self.init_axis()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), 'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") 'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
} }
self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])} self.attrs = {'axis': self.axis}
self.outputs = {'Out': self.out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad_normal(self): def test_check_grad_normal(self):
if self.dtype == np.float16:
return
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
def test_check_grad_ingore_x(self): def test_check_grad_ingore_x(self):
if self.dtype == np.float16:
return
self.check_grad( self.check_grad(
['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
def test_check_grad_ingore_y(self): def test_check_grad_ingore_y(self):
if self.dtype == np.float16:
return
self.check_grad( self.check_grad(
['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.add(self.x, self.y)
class TestElementwiseAddOp_scalar(TestElementwiseOp): def init_dtype(self):
def setUp(self): pass
self.op_type = "elementwise_add"
self.inputs = {
'X': np.random.rand(2, 3, 4).astype(np.float32),
'Y': np.random.rand(1).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
def init_axis(self):
pass
class TestElementwiseAddOp_scalar2(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_add"
self.inputs = {
'X': np.random.rand(2, 3, 4).astype(np.float32),
'Y': np.random.rand(1, 1).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
def init_dtype(self):
self.dtype = np.float16
class TestElementwiseAddOp_Vector(TestElementwiseOp): def test_check_output(self):
def setUp(self): if core.is_compiled_with_cuda():
self.op_type = "elementwise_add" place = core.CUDAPlace(0)
self.inputs = { if core.is_float16_supported(place):
'X': np.random.random((32, )).astype("float32"), self.check_output_with_place(place, atol=1e-3)
'Y': np.random.random((32, )).astype("float32")
}
self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
class TestElementwiseAddOp_broadcast_0(TestElementwiseOp): class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
def setUp(self): def init_input_output(self):
self.op_type = "elementwise_add" self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.inputs = { self.y = np.random.rand(1).astype(self.dtype)
'X': np.random.rand(2, 3, 4).astype(np.float32), self.out = self.x + self.y
'Y': np.random.rand(2).astype(np.float32)
}
self.attrs = {'axis': 0}
self.outputs = {
'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1)
}
class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_broadcast_1(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_add"
self.inputs = {
'X': np.random.rand(2, 3, 4).astype(np.float32),
'Y': np.random.rand(3).astype(np.float32)
}
self.attrs = {'axis': 1} class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
self.outputs = { def init_input_output(self):
'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1) self.x = np.random.rand(2, 3, 4).astype(self.dtype)
} self.y = np.random.rand(1, 1).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_broadcast_2(TestElementwiseOp): class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
def setUp(self): def init_input_output(self):
self.op_type = "elementwise_add" self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.inputs = { self.y = np.random.rand(1, 1).astype(self.dtype)
'X': np.random.rand(2, 3, 4).astype(np.float32), self.out = self.x + self.y
'Y': np.random.rand(4).astype(np.float32)
}
self.outputs = {
'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4)
}
class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.random((32, )).astype(self.dtype)
self.y = np.random.random((32, )).astype(self.dtype)
self.out = np.add(self.x, self.y)
class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_add"
self.inputs = {
'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
'Y': np.random.rand(3, 4).astype(np.float32)
}
self.attrs = {'axis': 1} class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
self.outputs = { def init_input_output(self):
'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1) self.x = np.random.random((32, )).astype(self.dtype)
} self.y = np.random.random((32, )).astype(self.dtype)
self.out = np.add(self.x, self.y)
class TestElementwiseAddOp_broadcast_4(TestElementwiseOp): class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
def setUp(self): def init_input_output(self):
self.op_type = "elementwise_add" self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.inputs = { self.y = np.random.rand(2).astype(self.dtype)
'X': np.random.rand(2, 3, 4, 5).astype(np.float32), self.out = self.x + self.y.reshape(2, 1, 1)
'Y': np.random.rand(2, 1).astype(np.float32)
}
self.attrs = {'axis': 0} def init_axis(self):
self.outputs = { self.axis = 0
'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1, 1)
}
class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp): class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
def setUp(self): def init_input_output(self):
self.op_type = "elementwise_add" self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.inputs = { self.y = np.random.rand(2).astype(self.dtype)
'X': np.random.rand(2, 3, 4).astype(np.float32), self.out = self.x + self.y.reshape(2, 1, 1)
'Y': np.random.rand(3, 4).astype(np.float32)
}
self.attrs = {'axis': 1} def init_axis(self):
self.outputs = { self.axis = 0
'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4)
}
class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp): class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
def setUp(self): def init_input_output(self):
self.op_type = "elementwise_add" self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.inputs = { self.y = np.random.rand(3).astype(self.dtype)
'X': np.random.rand(2, 1).astype(np.float32), self.out = self.x + self.y.reshape(1, 3, 1)
'Y': np.random.rand(1).astype(np.float32)
}
self.attrs = {'axis': 1} def init_axis(self):
self.outputs = { self.axis = 1
'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1)
}
class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(3).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1, 4)
class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1, 4)
class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(3, 4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 4, 1)
def init_axis(self):
self.axis = 1
class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(3, 4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 4, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(2, 1).astype(self.dtype)
self.out = self.x + self.y.reshape(2, 1, 1, 1)
def init_axis(self):
self.axis = 0
class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(2, 1).astype(self.dtype)
self.out = self.x + self.y.reshape(2, 1, 1, 1)
def init_axis(self):
self.axis = 0
class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(3, 4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 4)
def init_axis(self):
self.axis = 1
class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(3, 4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 4)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 1).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1)
def init_axis(self):
self.axis = 1
class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 1).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1)
def init_axis(self):
self.axis = 1
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -16,7 +16,6 @@ import unittest ...@@ -16,7 +16,6 @@ import unittest
import numpy import numpy
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.layers import mul, data from paddle.fluid.layers import mul, data
......
...@@ -181,8 +181,8 @@ class TestBook(unittest.TestCase): ...@@ -181,8 +181,8 @@ class TestBook(unittest.TestCase):
with program_guard(program): with program_guard(program):
x = layers.data(name='x', shape=[10], dtype='float32') x = layers.data(name='x', shape=[10], dtype='float32')
y = layers.data( y = layers.data(
name='y', shape=[10, 20], dtype='float32', lod_level=1) name='y', shape=[10, 20], dtype='float32', lod_level=2)
self.assertIsNotNone(layers.sequence_expand(x=x, y=y)) self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1))
print(str(program)) print(str(program))
def test_lstm_unit(self): def test_lstm_unit(self):
...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase): ...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.softmax(hid)) self.assertIsNotNone(layers.softmax(hid))
print(str(program)) print(str(program))
def test_lrn(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[6, 2, 2], dtype='float32')
self.assertIsNotNone(layers.lrn(data))
print(str(program))
def test_get_places(self): def test_get_places(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -87,5 +87,34 @@ class TestLRNOp(OpTest): ...@@ -87,5 +87,34 @@ class TestLRNOp(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.01) self.check_grad(['X'], 'Out', max_relative_error=0.01)
class TestLRNMKLDNNOp(TestLRNOp):
def get_attrs(self):
attrs = TestLRNOp.get_attrs(self)
attrs['use_mkldnn'] = True
return attrs
def test_check_output(self):
self.check_output(atol=0.002)
class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
def get_attrs(self):
attrs = TestLRNMKLDNNOp.get_attrs(self)
attrs['is_test'] = True
return attrs
def test_check_grad_normal(self):
def check_raise_is_test():
try:
self.check_grad(['X'], 'Out', max_relative_error=0.01)
except Exception as e:
t = \
"is_test attribute should be set to False in training phase."
if t in str(e):
raise AttributeError
self.assertRaises(AttributeError, check_raise_is_test)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
class TestMultipleReader(unittest.TestCase):
def setUp(self):
self.batch_size = 64
self.pass_num = 3
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = paddle.batch(mnist.train(), batch_size=self.batch_size)
feeder = fluid.DataFeeder(
feed_list=[
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist.recordio', data_file, feeder)
def test_main(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.open_recordio_file(
filename='./mnist.recordio',
shapes=[(-1, 784), (-1, 1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
data_file = fluid.layers.create_multi_pass_reader(
reader=data_file, pass_num=self.pass_num)
img, label = fluid.layers.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
batch_count = 0
while not data_file.eof():
img_val, = exe.run(fetch_list=[img])
batch_count += 1
self.assertLessEqual(img_val.shape[0], self.batch_size)
data_file.reset()
self.assertEqual(batch_count, self.num_batch * self.pass_num)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
from shutil import copyfile
class TestMultipleReader(unittest.TestCase):
def setUp(self):
self.batch_size = 64
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=self.batch_size)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist_0.recordio', reader, feeder)
copyfile('./mnist_0.recordio', './mnist_1.recordio')
copyfile('./mnist_0.recordio', './mnist_2.recordio')
def main(self, thread_num):
file_list = [
'./mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
]
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_files = fluid.layers.open_files(
filenames=file_list,
thread_num=thread_num,
shapes=[(-1, 784), (-1, 1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(data_files)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
batch_count = 0
while not data_files.eof():
img_val, = exe.run(fetch_list=[img])
batch_count += 1
self.assertLessEqual(img_val.shape[0], self.batch_size)
data_files.reset()
self.assertEqual(batch_count, self.num_batch * 3)
def test_main(self):
self.main(thread_num=3) # thread number equals to file number
self.main(thread_num=10) # thread number is larger than file number
self.main(thread_num=2) # thread number is less than file number
...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase): ...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase):
def init_serv(self, place): def init_serv(self, place):
main = fluid.Program() main = fluid.Program()
with fluid.program_guard(main): with fluid.program_guard(main):
x = layers.data( serv = layers.ListenAndServ(
shape=[32, 32], "127.0.0.1:6174", ["X"], optimizer_mode=False)
dtype='float32',
name="X",
append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block())
serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
with serv.do(): with serv.do():
x = layers.data(
shape=[32, 32],
dtype='float32',
name="X",
append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block())
o = layers.scale(x=x, scale=10.0) o = layers.scale(x=x, scale=10.0)
main.global_block().create_var( main.global_block().create_var(
name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape) name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape)
......
...@@ -33,12 +33,36 @@ class TestSequenceExpand(OpTest): ...@@ -33,12 +33,36 @@ class TestSequenceExpand(OpTest):
def compute(self): def compute(self):
x = self.inputs['X'] x = self.inputs['X']
x_data, x_lod = x if type(x) == tuple else (x, None) x_data, x_lod = x if type(x) == tuple else (x, None)
n = 1 + x_data.shape[0] if not x_lod else len(x_lod[0])
y_data, y_lod = self.inputs['Y'] y_data, y_lod = self.inputs['Y']
repeats = [((y_lod[-1][i + 1] - y_lod[-1][i]))
for i in range(len(y_lod[-1]) - 1)] if hasattr(self, 'attrs'):
out = x_data.repeat(repeats, axis=0) ref_level = self.attrs['ref_level']
self.outputs = {'Out': out} else:
ref_level = len(y_lod) - 1
out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype)
if x_lod is None:
x_idx = [i for i in xrange(x_data.shape[0] + 1)]
else:
x_idx = x_lod[0]
out_lod = [[0]]
for i in xrange(1, len(y_lod[ref_level])):
repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]
x_len = x_idx[i] - x_idx[i - 1]
if repeat_num > 0:
x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
x_sub = np.repeat(x_sub, repeat_num, axis=0)
out = np.vstack((out, x_sub))
if x_lod is not None:
for j in xrange(repeat_num):
out_lod[0].append(out_lod[0][-1] + x_len)
if x_lod is None:
self.outputs = {'Out': out}
else:
self.outputs = {'Out': (out, out_lod)}
def setUp(self): def setUp(self):
self.op_type = 'sequence_expand' self.op_type = 'sequence_expand'
...@@ -51,43 +75,44 @@ class TestSequenceExpand(OpTest): ...@@ -51,43 +75,44 @@ class TestSequenceExpand(OpTest):
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
# class TestSequenceExpandCase1(TestSequenceExpand):
# def set_data(self): class TestSequenceExpandCase1(TestSequenceExpand):
# x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') def set_data(self):
# x_lod = [[0, 2, 5]] x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
# y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') x_lod = [[0, 2, 5]]
# y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
# self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
# class TestSequenceExpandCase2(TestSequenceExpand): self.attrs = {'ref_level': 0}
# def set_data(self):
# x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
# x_lod = [[0, 1]] class TestSequenceExpandCase2(TestSequenceExpand):
# y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') def set_data(self):
# y_lod = [[0, 2]] x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
# self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} x_lod = [[0, 1]]
y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
# class TestSequenceExpandCase3(TestSequenceExpand): y_lod = [[0, 2], [0, 2]]
# def set_data(self): self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
# x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') self.attrs = {'ref_level': 0}
# x_lod = [[0, 1, 2, 3, 4]]
# y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
# y_lod = [[0, 2, 4, 4, 6]] class TestSequenceExpandCase3(TestSequenceExpand):
# self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} def set_data(self):
x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
# class TestSequenceExpandCase4(TestSequenceExpand): x_lod = [[0, 1, 2, 3, 4]]
# def set_data(self): y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
# x_data = np.array( y_lod = [[0, 2, 4, 4, 6]]
# [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape( self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
# [2, 5]).astype('float32')
# x_lod = [[
# 0, class TestSequenceExpandCase4(TestSequenceExpand):
# 1, def set_data(self):
# 2, data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]
# ]] x_data = np.array(data).reshape([5, 2]).astype('float32')
# y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') x_lod = [[0, 2, 5]]
# y_lod = [[0, 1, 2], [0, 1, 2]] y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
# self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} y_lod = [[0, 1, 2], [0, 1, 2]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -29,15 +29,20 @@ class TestSoftmaxOp(OpTest): ...@@ -29,15 +29,20 @@ class TestSoftmaxOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softmax" self.op_type = "softmax"
self.use_cudnn = False self.use_cudnn = False
self.inputs = { self.use_mkldnn = False
'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32") self.dtype = np.float32
} self.init_kernel_type()
self.outputs = {
'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
out = np.apply_along_axis(stable_softmax, 1, x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn
} }
self.attrs = {'use_cudnn': self.use_cudnn, }
def init_op_type(self): def init_kernel_type(self):
pass pass
def test_check_output(self): def test_check_output(self):
...@@ -48,6 +53,8 @@ class TestSoftmaxOp(OpTest): ...@@ -48,6 +53,8 @@ class TestSoftmaxOp(OpTest):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
if self.use_cudnn: if self.use_cudnn:
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -57,8 +64,25 @@ class TestSoftmaxOp(OpTest): ...@@ -57,8 +64,25 @@ class TestSoftmaxOp(OpTest):
class TestSoftmaxCUDNNOp(TestSoftmaxOp): class TestSoftmaxCUDNNOp(TestSoftmaxOp):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True
class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSplitIdsOp(OpTest):
def setUp(self):
self.op_type = "split_ids"
ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
out0 = np.array([[0], [3], [6]]).astype('int64')
out1 = np.array([[]]).astype('int64')
out2 = np.array([[2], [2], [5], [5]]).astype('int64')
self.inputs = {'Ids': ids}
self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase): ...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase):
def test_lod_tensor_gpu_init(self): def test_lod_tensor_gpu_init(self):
if not core.is_compiled_with_cuda(): if not core.is_compiled_with_cuda():
return return
scope = core.Scope()
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase): ...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase):
self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
self.assertListEqual(lod_py, lod_tensor.lod()) self.assertListEqual(lod_py, lod_tensor.lod())
def test_empty_tensor(self):
place = core.CPUPlace()
scope = core.Scope()
var = scope.var("test_tensor")
tensor = var.get_tensor()
tensor.set_dims([0, 1])
tensor.alloc_float(place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if core.is_compiled_with_cuda():
gpu_place = core.CUDAPlace(0)
tensor.alloc_float(gpu_place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation): ...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation):
.. math:: .. math::
P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} } P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
""" """
def __init__(self): def __init__(self):
......
...@@ -2747,17 +2747,17 @@ def img_pool_layer(input, ...@@ -2747,17 +2747,17 @@ def img_pool_layer(input,
.. math:: .. math::
w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} w & = 1 + ceil(\\frac{input\_width + 2 * padding - pool\_size}{stride})
h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} h & = 1 + ceil(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
- ceil_mode=False: - ceil_mode=False:
.. math:: .. math::
w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} w & = 1 + floor(\\frac{input\_width + 2 * padding - pool\_size}{stride})
h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} h & = 1 + floor(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
The example usage is: The example usage is:
......
...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could ...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
be used in user program. be used in user program.
""" """
__all__ = ['np_array', 'text_file', "cloud_reader"] __all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
def np_array(x): def np_array(x):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册