未验证 提交 3777f102 编写于 作者: Y yuyang18

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into pr/11812

...@@ -61,8 +61,10 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) ...@@ -61,8 +61,10 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
...@@ -131,6 +133,10 @@ if (NOT DEFINED WITH_MKLDNN) ...@@ -131,6 +133,10 @@ if (NOT DEFINED WITH_MKLDNN)
set(WITH_MKLDNN OFF) set(WITH_MKLDNN OFF)
endif() endif()
endif() endif()
if (REPLACE_ENFORCE_GLOG)
add_definitions("-DREPLACE_ENFORCE_GLOG")
endif()
######################################################################################## ########################################################################################
include(external/mklml) # download mklml package include(external/mklml) # download mklml package
...@@ -153,12 +159,24 @@ include(external/cares) ...@@ -153,12 +159,24 @@ include(external/cares)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
if(WITH_GRPC) if(WITH_GRPC)
include(external/grpc) include(external/grpc)
message(STATUS "Use grpc framework.")
else() else()
message(STATUS "Use brpc framework.")
include(external/leveldb) include(external/leveldb)
include(external/brpc) include(external/brpc)
endif() endif()
endif() endif()
if(WITH_BRPC_RDMA)
message(STATUS "Use brpc with rdma.")
if(WITH_GRPC)
message(FATAL_ERROR "Can't use grpc with brpc rdma.")
endif()
if(NOT WITH_DISTRIBUTE)
message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
endif()
endif()
include(external/snappy) # download snappy include(external/snappy) # download snappy
include(external/snappystream) include(external/snappystream)
include(external/threadpool) include(external/threadpool)
...@@ -178,7 +196,7 @@ include(inference_lib) # add paddle fluid inference libraries ...@@ -178,7 +196,7 @@ include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}") include_directories("${PADDLE_SOURCE_DIR}")
include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
...@@ -222,7 +240,7 @@ add_subdirectory(proto) ...@@ -222,7 +240,7 @@ add_subdirectory(proto)
if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY) if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine, # "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer. # because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer) add_subdirectory(paddle/legacy/optimizer)
endif() endif()
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
......
...@@ -159,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the ...@@ -159,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) - verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) - verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) - verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) - verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......
...@@ -125,6 +125,10 @@ def parse_args(): ...@@ -125,6 +125,10 @@ def parse_args():
parser.add_argument( parser.add_argument(
'--use_inference_transpiler', '--use_inference_transpiler',
action='store_true', action='store_true',
help='If set, uses inference transpiler to optimize the program.') help='If set, use inference transpiler to optimize the program.')
parser.add_argument(
'--no_random',
action='store_true',
help='If set, keep the random seed and do not shuffle the data.')
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -132,10 +132,6 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, ...@@ -132,10 +132,6 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(startup_prog) exe.run(startup_prog)
# Use inference_transpiler to speedup # Use inference_transpiler to speedup
if args.use_inference_transpiler:
t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)
if not args.use_reader_op: if not args.use_reader_op:
feed_var_list = [ feed_var_list = [
var for var in train_prog.global_block().vars.itervalues() var for var in train_prog.global_block().vars.itervalues()
...@@ -186,6 +182,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, ...@@ -186,6 +182,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
# evaluation # evaluation
if not args.no_test and batch_acc and not args.use_reader_op: if not args.no_test and batch_acc and not args.use_reader_op:
if args.use_inference_transpiler:
t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)
pass_test_acc = test(exe, infer_prog, test_reader, feeder, pass_test_acc = test(exe, infer_prog, test_reader, feeder,
batch_acc) batch_acc)
print(", Test Accuracy: %f" % pass_test_acc) print(", Test Accuracy: %f" % pass_test_acc)
...@@ -316,6 +316,8 @@ def main(): ...@@ -316,6 +316,8 @@ def main():
args = parse_args() args = parse_args()
print_arguments(args) print_arguments(args)
print_paddle_envs() print_paddle_envs()
if args.no_random:
fluid.default_startup_program().random_seed = 1
# the unique trainer id, starting from 0, needed by trainer # the unique trainer id, starting from 0, needed by trainer
# only # only
......
...@@ -197,12 +197,12 @@ def get_model(args): ...@@ -197,12 +197,12 @@ def get_model(args):
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
batched_train_reader = paddle.batch( batched_train_reader = paddle.batch(
paddle.reader.shuffle( train_reader if args.no_random else paddle.reader.shuffle(
train_reader, buf_size=5120), train_reader, buf_size=5120),
batch_size=args.batch_size * args.gpus, batch_size=args.batch_size * args.gpus,
drop_last=True) drop_last=True)
batched_test_reader = paddle.batch( batched_test_reader = paddle.batch(
train_reader, batch_size=args.batch_size, drop_last=True) test_reader, batch_size=args.batch_size, drop_last=True)
return avg_cost, inference_program, optimizer, batched_train_reader,\ return avg_cost, inference_program, optimizer, batched_train_reader,\
batched_test_reader, batch_acc batched_test_reader, batch_acc
...@@ -174,3 +174,7 @@ endif(WITH_GOLANG) ...@@ -174,3 +174,7 @@ endif(WITH_GOLANG)
if(WITH_GRPC) if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC) endif(WITH_GRPC)
if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA)
...@@ -14,6 +14,15 @@ ...@@ -14,6 +14,15 @@
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
find_library(SSL_LIBRARY NAMES ssl)
ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
find_library(CRYPTO_LIBRARY NAMES crypto)
ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
...@@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr ...@@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add( ExternalProject_Add(
extern_brpc extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/brpc/brpc" GIT_REPOSITORY "https://github.com/gongweibao/brpc"
GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a"
PREFIX ${BRPC_SOURCES_DIR} PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
...@@ -42,6 +51,8 @@ ExternalProject_Add( ...@@ -42,6 +51,8 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_PREFIX_PATH=${prefix_path} -DCMAKE_PREFIX_PATH=${prefix_path}
-DBRPC_WITH_GLOG=ON -DBRPC_WITH_GLOG=ON
-DIOBUF_WITH_HUGE_BLOCK=ON
-DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
${EXTERNAL_OPTIONAL_ARGS} ${EXTERNAL_OPTIONAL_ARGS}
LIST_SEPARATOR | LIST_SEPARATOR |
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
...@@ -49,7 +60,7 @@ ExternalProject_Add( ...@@ -49,7 +60,7 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
) )
ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc) ADD_DEPENDENCIES(brpc extern_brpc)
......
...@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID) ...@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID) endif(NOT APPLE AND NOT ANDROID)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
# for building inference libs
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
function(merge_static_libs TARGET_NAME) function(merge_static_libs TARGET_NAME)
set(libs ${ARGN}) set(libs ${ARGN})
list(REMOVE_DUPLICATES libs) list(REMOVE_DUPLICATES libs)
...@@ -250,6 +264,7 @@ function(cc_test TARGET_NAME) ...@@ -250,6 +264,7 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL}) if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif() endif()
endif() endif()
endfunction(cc_test) endfunction(cc_test)
...@@ -314,6 +329,7 @@ function(nv_test TARGET_NAME) ...@@ -314,6 +329,7 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL) if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif() endif()
endif() endif()
endfunction(nv_test) endfunction(nv_test)
...@@ -561,7 +577,7 @@ function(py_test TARGET_NAME) ...@@ -561,7 +577,7 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS) set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif() endif()
......
...@@ -12,19 +12,6 @@ ...@@ -12,19 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
# make package for paddle fluid shared and static library # make package for paddle fluid shared and static library
function(copy TARGET) function(copy TARGET)
set(options "") set(options "")
...@@ -154,7 +141,7 @@ set(inference_deps paddle_fluid_shared paddle_fluid) ...@@ -154,7 +141,7 @@ set(inference_deps paddle_fluid_shared paddle_fluid)
if(WITH_CONTRIB) if(WITH_CONTRIB)
message(STATUS "installing contrib") message(STATUS "installing contrib")
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN) if (WITH_ANAKIN AND WITH_GPU)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
...@@ -163,9 +150,9 @@ if(WITH_CONTRIB) ...@@ -163,9 +150,9 @@ if(WITH_CONTRIB)
list(APPEND inference_deps contrib_anakin_inference_lib) list(APPEND inference_deps contrib_anakin_inference_lib)
endif() endif()
copy(contrib_inference_lib DEPS paddle_inference_api copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
DSTS ${contrib_dst_dir} ${contrib_dst_dir}) DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib) list(APPEND inference_deps contrib_inference_lib)
endif() endif()
......
...@@ -1468,6 +1468,14 @@ argmax ...@@ -1468,6 +1468,14 @@ argmax
.. autofunction:: paddle.fluid.layers.argmax .. autofunction:: paddle.fluid.layers.argmax
:noindex: :noindex:
.. _api_fluid_layers_argsort:
argsort
-------
.. autofunction:: paddle.fluid.layers.argsort
:noindex:
.. _api_fluid_layers_ones: .. _api_fluid_layers_ones:
ones ones
......
...@@ -74,10 +74,10 @@ void OperatorWithKernel::Run( ...@@ -74,10 +74,10 @@ void OperatorWithKernel::Run(
auto kernel_type_for_var = this->GetKernelTypeForVar(...); auto kernel_type_for_var = this->GetKernelTypeForVar(...);
if (kernel_type_for_var.place_ != expected_kernel_key.place_) { if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
auto* trans_var = new_scope.Var(var_name); auto* trans_var = new_scope.Var(var_name);
auto* out = DataTransform(expected_kernel_key, auto* out = TransformData(expected_kernel_key,
kernel_type_for_var, kernel_type_for_var,
*tensor_in); *tensor_in);
CopyVariableWithTensor(...); SetTensorToVariable(...);
} }
} }
......
...@@ -65,7 +65,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix matrix, ...@@ -65,7 +65,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` 而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp`
```cpp ```cpp
#include "paddle/math/matrix.h" #include "paddle/legacy/math/matrix.h"
extern "C" extern "C"
paddle_error paddle_matrix_shape(paddle_matrix matrix, paddle_error paddle_matrix_shape(paddle_matrix matrix,
uint64_t *width, uint64_t *width,
......
...@@ -58,7 +58,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 ...@@ -58,7 +58,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
实现C++类 实现C++类
=================== ===================
一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。 一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。
这个类需要继承 :code:`paddle::Layer` 这个基类,并且需要重写基类中的以下几个虚函数: 这个类需要继承 :code:`paddle::Layer` 这个基类,并且需要重写基类中的以下几个虚函数:
...@@ -153,7 +153,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 ...@@ -153,7 +153,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。 - 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。
- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小,所以这一步是必要的。 :code:`reserveOutput` 会相应地改变输出的尺寸。为了保证效率,如果需要扩大矩阵,我们会重新分配内存;如果需要缩减矩阵,我们会继续使用现有的内存块。 - 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小,所以这一步是必要的。 :code:`reserveOutput` 会相应地改变输出的尺寸。为了保证效率,如果需要扩大矩阵,我们会重新分配内存;如果需要缩减矩阵,我们会继续使用现有的内存块。
- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/math/Matrix.h`和:code:`paddle/math/BaseMatrix.h` 。 - 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/legacy/math/Matrix.h`和:code:`paddle/legacy/math/BaseMatrix.h` 。
- 最终,使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。 - 最终,使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。
...@@ -262,7 +262,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 ...@@ -262,7 +262,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
REGISTER_LAYER(fc, FullyConnectedLayer); REGISTER_LAYER(fc, FullyConnectedLayer);
} }
若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下,其会自动被加入编译列表。 若 :code:`cpp` 被放在 :code:`paddle/legacy/gserver/layers` 目录下,其会自动被加入编译列表。
写梯度检查单元测试 写梯度检查单元测试
...@@ -270,7 +270,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 ...@@ -270,7 +270,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ,然后观察到输出的变化为 :math:`\Delta y` ,那么,梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后,再用这个梯度去和 :code:`backward` 函数得到的梯度去对比,以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算,并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。 写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ,然后观察到输出的变化为 :math:`\Delta y` ,那么,梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后,再用这个梯度去和 :code:`backward` 函数得到的梯度去对比,以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算,并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。
所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步: 所有网络层的梯度检查单测都位于 :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步:
+ 生成网络层配置。网络层配置包含以下几项: + 生成网络层配置。网络层配置包含以下几项:
- 偏置参数的大小。(例子中是4096) - 偏置参数的大小。(例子中是4096)
...@@ -322,7 +322,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 ...@@ -322,7 +322,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
} }
} }
如果你要为了测试而增加新的文件,例如 :code:`paddle/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。 如果你要为了测试而增加新的文件,例如 :code:`paddle/legacy/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/legacy/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。
.. code-block:: bash .. code-block:: bash
......
...@@ -58,7 +58,7 @@ Finally we can use chain rule to calculate :math:`\frac{\partial z}{\partial x}` ...@@ -58,7 +58,7 @@ Finally we can use chain rule to calculate :math:`\frac{\partial z}{\partial x}`
Implement C++ Class Implement C++ Class
=================== ===================
The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below. The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below.
It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions: It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions:
...@@ -154,7 +154,7 @@ The implementation of the forward part has the following steps. ...@@ -154,7 +154,7 @@ The implementation of the forward part has the following steps.
- Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function. - Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function.
- Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we support the batches to have different batch sizes. :code:`reserveOutput` will change the size of the output accordingly. For the sake of efficiency, we will allocate new memory if we want to expand the matrix, but we will reuse the existing memory block if we want to shrink the matrix. - Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we support the batches to have different batch sizes. :code:`reserveOutput` will change the size of the output accordingly. For the sake of efficiency, we will allocate new memory if we want to expand the matrix, but we will reuse the existing memory block if we want to shrink the matrix.
- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/math/Matrix.h` and :code:`paddle/math/BaseMatrix.h`. - Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/legacy/math/Matrix.h` and :code:`paddle/legacy/math/BaseMatrix.h`.
- Finally it applies the activation function using :code:`forwardActivation();`. It will automatically applies the corresponding activation function specifies in the network configuration. - Finally it applies the activation function using :code:`forwardActivation();`. It will automatically applies the corresponding activation function specifies in the network configuration.
...@@ -263,7 +263,7 @@ Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to registe ...@@ -263,7 +263,7 @@ Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to registe
REGISTER_LAYER(fc, FullyConnectedLayer); REGISTER_LAYER(fc, FullyConnectedLayer);
} }
If the :code:`cpp` file is put into :code:`paddle/gserver/layers`, it will be automatically added to the compilation list. If the :code:`cpp` file is put into :code:`paddle/legacy/gserver/layers`, it will be automatically added to the compilation list.
Write Gradient Check Unit Test Write Gradient Check Unit Test
...@@ -271,7 +271,7 @@ Write Gradient Check Unit Test ...@@ -271,7 +271,7 @@ Write Gradient Check Unit Test
An easy way to verify the correctness of new layer's implementation is to write a gradient check unit test. Gradient check unit test utilizes finite difference method to verify the gradient of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the changes of output :math:`\Delta y`, the gradient can be computed as :math:`\frac{\Delta y}{\Delta x }`. This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation, it does not necessarily guarantee the correctness of the implementation of the :code:`forward` and :code:`backward` function. You need to write more sophisticated unit tests to make sure your layer is implemented correctly. An easy way to verify the correctness of new layer's implementation is to write a gradient check unit test. Gradient check unit test utilizes finite difference method to verify the gradient of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the changes of output :math:`\Delta y`, the gradient can be computed as :math:`\frac{\Delta y}{\Delta x }`. This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation, it does not necessarily guarantee the correctness of the implementation of the :code:`forward` and :code:`backward` function. You need to write more sophisticated unit tests to make sure your layer is implemented correctly.
All the gradient check unit tests are located in :code:`paddle/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps. All the gradient check unit tests are located in :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps.
+ Create layer configuration. A layer configuration can include the following attributes: + Create layer configuration. A layer configuration can include the following attributes:
- size of the bias parameter. (4096 in our example) - size of the bias parameter. (4096 in our example)
...@@ -323,7 +323,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes ...@@ -323,7 +323,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
} }
} }
If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. If you are creating a new file for the test, such as :code:`paddle/legacy/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/legacy/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
.. code-block:: bash .. code-block:: bash
......
...@@ -196,6 +196,6 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数 ...@@ -196,6 +196,6 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
obj="process", obj="process",
args={"src_dict_path": src_dict_path}) args={"src_dict_path": src_dict_path})
完整源码可参考 `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ 示例。 完整源码可参考 `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_recurrent.py>`_ 示例。
...@@ -50,12 +50,12 @@ GPU则还需要高并行性,才能发挥其全部能力。这正是它们速 ...@@ -50,12 +50,12 @@ GPU则还需要高并行性,才能发挥其全部能力。这正是它们速
**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 **nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
在这个教程中,我们主要会介绍nvprof和nvvp。 在这个教程中,我们主要会介绍nvprof和nvvp。
:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate :code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
above profilers. above profilers.
:code:`paddle/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 :code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
:language: c++ :language: c++
:lines: 137-151 :lines: 137-151
:linenos: :linenos:
...@@ -83,7 +83,7 @@ program crashes when CPU version of PaddlePaddle invokes them. ...@@ -83,7 +83,7 @@ program crashes when CPU version of PaddlePaddle invokes them.
1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。
.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
:language: c++ :language: c++
:lines: 137-151 :lines: 137-151
:emphasize-lines: 8-12,14 :emphasize-lines: 8-12,14
...@@ -101,8 +101,8 @@ program crashes when CPU version of PaddlePaddle invokes them. ...@@ -101,8 +101,8 @@ program crashes when CPU version of PaddlePaddle invokes them.
.. code-block:: bash .. code-block:: bash
:emphasize-lines: 1,12-15 :emphasize-lines: 1,12-15
> ./paddle/math/tests/test_GpuProfiler > ./paddle/legacy/math/tests/test_GpuProfiler
I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
[==========] Running 1 test from 1 test case. [==========] Running 1 test from 1 test case.
...@@ -130,7 +130,7 @@ nvprof 工具 ...@@ -130,7 +130,7 @@ nvprof 工具
1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。
.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
:language: c++ :language: c++
:lines: 137-151 :lines: 137-151
:emphasize-lines: 6-7 :emphasize-lines: 6-7
...@@ -147,13 +147,13 @@ nvprof 工具 ...@@ -147,13 +147,13 @@ nvprof 工具
.. code-block:: bash .. code-block:: bash
nvprof ./paddle/math/tests/test_GpuProfiler nvprof ./paddle/legacy/math/tests/test_GpuProfiler
然后,您就能获得如下的分析结果: 然后,您就能获得如下的分析结果:
.. code-block:: bash .. code-block:: bash
==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
==78544== Profiling result: ==78544== Profiling result:
Time(%) Time Calls Avg Min Max Name Time(%) Time Calls Avg Min Max Name
27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD]
......
...@@ -51,10 +51,10 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th ...@@ -51,10 +51,10 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th
**nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler. **nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler.
In this tutorial, we will focus on nvprof and nvvp. In this tutorial, we will focus on nvprof and nvvp.
:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate :code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
above profilers. above profilers.
.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
:language: c++ :language: c++
:lines: 137-151 :lines: 137-151
:linenos: :linenos:
...@@ -80,7 +80,7 @@ As a simple example, consider the following: ...@@ -80,7 +80,7 @@ As a simple example, consider the following:
1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines). 1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
:language: c++ :language: c++
:lines: 137-151 :lines: 137-151
:emphasize-lines: 8-12,14 :emphasize-lines: 8-12,14
...@@ -98,8 +98,8 @@ As a simple example, consider the following: ...@@ -98,8 +98,8 @@ As a simple example, consider the following:
.. code-block:: bash .. code-block:: bash
:emphasize-lines: 1,12-15 :emphasize-lines: 1,12-15
> ./paddle/math/tests/test_GpuProfiler > ./paddle/legacy/math/tests/test_GpuProfiler
I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
[==========] Running 1 test from 1 test case. [==========] Running 1 test from 1 test case.
...@@ -127,7 +127,7 @@ To use this command line profiler **nvprof**, you can simply issue the following ...@@ -127,7 +127,7 @@ To use this command line profiler **nvprof**, you can simply issue the following
1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines). 1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
:language: c++ :language: c++
:lines: 137-151 :lines: 137-151
:emphasize-lines: 6-7 :emphasize-lines: 6-7
...@@ -144,13 +144,13 @@ To use this command line profiler **nvprof**, you can simply issue the following ...@@ -144,13 +144,13 @@ To use this command line profiler **nvprof**, you can simply issue the following
.. code-block:: bash .. code-block:: bash
nvprof ./paddle/math/tests/test_GpuProfiler nvprof ./paddle/legacy/math/tests/test_GpuProfiler
Then, you can get the following profiling result: Then, you can get the following profiling result:
.. code-block:: bash .. code-block:: bash
==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
==78544== Profiling result: ==78544== Profiling result:
Time(%) Time Calls Avg Min Max Name Time(%) Time Calls Avg Min Max Name
27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD]
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
单双层RNN API对比介绍 单双层RNN API对比介绍
##################### #####################
本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。 本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
示例1:双层RNN,子序列间无Memory 示例1:双层RNN,子序列间无Memory
================================ ================================
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
在本示例中,单层RNN和双层RNN的网络配置,都是将每一句分好词后的句子,使用LSTM作为encoder,压缩成一个向量。区别是RNN使用两层序列模型,将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下: 在本示例中,单层RNN和双层RNN的网络配置,都是将每一句分好词后的句子,使用LSTM作为encoder,压缩成一个向量。区别是RNN使用两层序列模型,将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下:
* 单层RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_layer_group.conf>`_ * 单层RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_layer_group.conf>`_
* 双层RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_layer_group.conf>`_ * 双层RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf>`_
读取双层序列数据 读取双层序列数据
...@@ -24,18 +24,18 @@ ...@@ -24,18 +24,18 @@
- 本例中的原始数据一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。 - 本例中的原始数据一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。
.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg .. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
:language: text :language: text
- 双层序列数据一共有4个样本。 每个样本间用空行分开,整体数据和原始数据完全一样。但于双层序列的LSTM来说,第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。 - 双层序列数据一共有4个样本。 每个样本间用空行分开,整体数据和原始数据完全一样。但于双层序列的LSTM来说,第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。
.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest .. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
:language: text :language: text
其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\: 其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequenceGen.py>`_)\:
.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
:language: python :language: python
:lines: 21-39 :lines: 21-39
:linenos: :linenos:
...@@ -47,7 +47,7 @@ ...@@ -47,7 +47,7 @@
- words是原始数据中的每一句话,所对应的词表index数组。它是integer_value_sequence类型的,即整数数组。words即为这个数据中的单层时间序列。 - words是原始数据中的每一句话,所对应的词表index数组。它是integer_value_sequence类型的,即整数数组。words即为这个数据中的单层时间序列。
- label是原始数据中对于每一句话的分类标签,它是integer_value类型的。 - label是原始数据中对于每一句话的分类标签,它是integer_value类型的。
.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
:language: python :language: python
:lines: 42-71 :lines: 42-71
:linenos: :linenos:
...@@ -64,7 +64,7 @@ ...@@ -64,7 +64,7 @@
首先,我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中,RNN对于每一个时间步通过了一个LSTM网络。 首先,我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中,RNN对于每一个时间步通过了一个LSTM网络。
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf
:language: python :language: python
:lines: 38-63 :lines: 38-63
:linenos: :linenos:
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
* 至此,\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。 * 至此,\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
:language: python :language: python
:lines: 38-64 :lines: 38-64
:linenos: :linenos:
...@@ -107,7 +107,7 @@ ...@@ -107,7 +107,7 @@
- 单层RNN:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 - 单层RNN:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf
:language: python :language: python
:lines: 36-48 :lines: 36-48
...@@ -116,7 +116,7 @@ ...@@ -116,7 +116,7 @@
- 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。
- 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。 - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf
:language: python :language: python
:lines: 39-66 :lines: 39-66
...@@ -134,7 +134,7 @@ ...@@ -134,7 +134,7 @@
**输入不等长** 是指recurrent_group的多个输入序列,在每个时间步的子序列长度可以不相等。但序列输出时,需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致,默认指定第一个输入。 **输入不等长** 是指recurrent_group的多个输入序列,在每个时间步的子序列长度可以不相等。但序列输出时,需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致,默认指定第一个输入。
示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ 。 示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ 。
示例3对于单层RNN和双层RNN数据完全相同。 示例3对于单层RNN和双层RNN数据完全相同。
...@@ -152,14 +152,14 @@ ...@@ -152,14 +152,14 @@
* 单层RNN\: * 单层RNN\:
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
:language: python :language: python
:lines: 42-59 :lines: 42-59
:linenos: :linenos:
* 双层RNN\ \: * 双层RNN\ \:
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
:language: python :language: python
:lines: 41-80 :lines: 41-80
:linenos: :linenos:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
API comparision between RNN and hierarchical RNN API comparision between RNN and hierarchical RNN
##################### #####################
This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。 This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
Example 1:Hierarchical RNN without Memory between subsequences Example 1:Hierarchical RNN without Memory between subsequences
================================ ================================
...@@ -13,8 +13,8 @@ The classical case in the hierarchical RNN is to perform sequence operations on ...@@ -13,8 +13,8 @@ The classical case in the hierarchical RNN is to perform sequence operations on
In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows: In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows:
* RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_layer_group.conf>`_ * RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_layer_group.conf>`_
* Hierarchical RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_layer_group.conf>`_ * Hierarchical RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf>`_
Reading hierarchical sequence data Reading hierarchical sequence data
...@@ -24,18 +24,18 @@ Firstly, the original data in this example is as follows \: ...@@ -24,18 +24,18 @@ Firstly, the original data in this example is as follows \:
- The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well. - The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well.
.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg .. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
:language: text :language: text
- The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ . - The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ .
.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest .. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
:language: text :language: text
Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\: Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequenceGen.py>`_)\:
.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
:language: python :language: python
:lines: 21-39 :lines: 21-39
:linenos: :linenos:
...@@ -47,7 +47,7 @@ Secondly, as for these two types of different input data formats, the contrast o ...@@ -47,7 +47,7 @@ Secondly, as for these two types of different input data formats, the contrast o
- "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data. - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data.
- "label" is the categorical label of each sentence, whose data type is integer_value. - "label" is the categorical label of each sentence, whose data type is integer_value.
.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
:language: python :language: python
:lines: 42-71 :lines: 42-71
:linenos: :linenos:
...@@ -64,7 +64,7 @@ Model configuration ...@@ -64,7 +64,7 @@ Model configuration
Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network.
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf
:language: python :language: python
:lines: 38-63 :lines: 38-63
:linenos: :linenos:
...@@ -85,7 +85,7 @@ Secondly, let's look at the model configuration of hierarchical RNN which has th ...@@ -85,7 +85,7 @@ Secondly, let's look at the model configuration of hierarchical RNN which has th
* Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. * Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration.
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
:language: python :language: python
:lines: 38-64 :lines: 38-64
:linenos: :linenos:
...@@ -107,7 +107,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf ...@@ -107,7 +107,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf
- single-layer RNN:passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. - single-layer RNN:passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer.
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf
:language: python :language: python
:lines: 36-48 :lines: 36-48
...@@ -116,7 +116,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf ...@@ -116,7 +116,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf
- The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory.
- From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration.
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf
:language: python :language: python
:lines: 39-66 :lines: 39-66
...@@ -134,7 +134,7 @@ Example 3:hierarchical RNN with unequal length inputs ...@@ -134,7 +134,7 @@ Example 3:hierarchical RNN with unequal length inputs
**unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. **unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input.
The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ . The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ .
The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same.
...@@ -152,14 +152,14 @@ Similar to Example 2's configuration, Example 3's configuration uses single-laye ...@@ -152,14 +152,14 @@ Similar to Example 2's configuration, Example 3's configuration uses single-laye
* single-layer RNN\: * single-layer RNN\:
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
:language: python :language: python
:lines: 42-59 :lines: 42-59
:linenos: :linenos:
* hierarchical RNN\ \: * hierarchical RNN\ \:
.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py .. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
:language: python :language: python
:lines: 41-80 :lines: 41-80
:linenos: :linenos:
......
...@@ -16,7 +16,7 @@ package pserver ...@@ -16,7 +16,7 @@ package pserver
// #cgo CFLAGS: -I ../../ // #cgo CFLAGS: -I ../../
// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
// #include "paddle/optimizer/optimizer.h" // #include "paddle/legacy/optimizer/optimizer.h"
// #include <stdlib.h> // #include <stdlib.h>
// #include <string.h> // #include <string.h>
import "C" import "C"
......
if(NOT WITH_FLUID_ONLY) if(NOT WITH_FLUID_ONLY)
add_subdirectory(cuda) add_subdirectory(legacy/cuda)
add_subdirectory(function) add_subdirectory(legacy/function)
add_subdirectory(utils) add_subdirectory(utils)
add_subdirectory(math) add_subdirectory(legacy/math)
add_subdirectory(gserver) add_subdirectory(legacy/gserver)
add_subdirectory(parameter) add_subdirectory(legacy/parameter)
if(MOBILE_INFERENCE) if(MOBILE_INFERENCE)
add_subdirectory(capi) add_subdirectory(capi)
else() else()
add_subdirectory(pserver) add_subdirectory(legacy/pserver)
add_subdirectory(trainer) add_subdirectory(trainer)
add_subdirectory(scripts) add_subdirectory(scripts)
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include "PaddleAPI.h" #include "PaddleAPI.h"
#include "PaddleAPIPrivate.h" #include "PaddleAPIPrivate.h"
#include "paddle/parameter/Argument.h" #include "paddle/legacy/parameter/Argument.h"
size_t Arguments::getSlotNum() const { return m->outputs.size(); } size_t Arguments::getSlotNum() const { return m->outputs.size(); }
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "PaddleAPIPrivate.h" #include "PaddleAPIPrivate.h"
#include "Internal.h" #include "Internal.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
std::vector<int> GradientMachine::defaultParamTypes = { std::vector<int> GradientMachine::defaultParamTypes = {
PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}; PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
......
...@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/math/Matrix.h" #include "paddle/legacy/math/Matrix.h"
#include <cstring> #include <cstring>
#include <iostream> #include <iostream>
#include "PaddleAPI.h" #include "PaddleAPI.h"
#include "paddle/math/CpuSparseMatrix.h" #include "paddle/legacy/math/CpuSparseMatrix.h"
#include "paddle/math/SparseMatrix.h" #include "paddle/legacy/math/SparseMatrix.h"
struct MatrixPrivate { struct MatrixPrivate {
std::shared_ptr<paddle::Matrix> mat; std::shared_ptr<paddle::Matrix> mat;
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/gserver/gradientmachines/GradientMachine.h" #include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
#include "paddle/utils/Common.h" #include "paddle/utils/Common.h"
#include "paddle/utils/GlobalConstants.h" #include "paddle/utils/GlobalConstants.h"
......
...@@ -14,9 +14,9 @@ limitations under the License. */ ...@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include "PaddleAPI.h" #include "PaddleAPI.h"
#include "paddle/gserver/evaluators/Evaluator.h" #include "paddle/legacy/gserver/evaluators/Evaluator.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h" #include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
#include "paddle/parameter/ParameterUpdaterBase.h" #include "paddle/legacy/parameter/ParameterUpdaterBase.h"
#include "paddle/trainer/TrainerConfigHelper.h" #include "paddle/trainer/TrainerConfigHelper.h"
struct GradientMachinePrivate { struct GradientMachinePrivate {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/parameter/Parameter.h" #include "paddle/legacy/parameter/Parameter.h"
#include "PaddleAPI.h" #include "PaddleAPI.h"
#include "PaddleAPIPrivate.h" #include "PaddleAPIPrivate.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/parameter/ParameterOptimizer.h" #include "paddle/legacy/parameter/ParameterOptimizer.h"
#include <algorithm> #include <algorithm>
#include "Internal.h" #include "Internal.h"
#include "PaddleAPI.h" #include "PaddleAPI.h"
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include <sstream> #include <sstream>
#include <vector> #include <vector>
#include "PaddleAPI.h" #include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h" #include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
#include "paddle/parameter/Argument.h" #include "paddle/legacy/parameter/Argument.h"
#include "paddle/utils/Flags.h" #include "paddle/utils/Flags.h"
// used to represent partial sequence // used to represent partial sequence
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include <atomic> #include <atomic>
#include <memory> #include <memory>
#include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
#include "paddle/trainer/ParamUtil.h" #include "paddle/trainer/ParamUtil.h"
#include "paddle/trainer/Trainer.h" #include "paddle/trainer/Trainer.h"
#include "paddle/trainer/TrainerInternal.h" #include "paddle/trainer/TrainerInternal.h"
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "PaddleAPI.h" #include "PaddleAPI.h"
#include "paddle/parameter/Parameter.h" #include "paddle/legacy/parameter/Parameter.h"
#include "paddle/utils/Common.h" #include "paddle/utils/Common.h"
#include "paddle/utils/Flags.h" #include "paddle/utils/Flags.h"
#include "paddle/utils/PythonUtil.h" #include "paddle/utils/PythonUtil.h"
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "PaddleAPI.h" #include "PaddleAPI.h"
#include "paddle/math/Vector.h" #include "paddle/legacy/math/Vector.h"
#include <cstring> #include <cstring>
......
...@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "capi.h" #include "capi.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h" #include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
#include "paddle/math/Matrix.h" #include "paddle/legacy/math/Matrix.h"
#include "paddle/math/Vector.h" #include "paddle/legacy/math/Vector.h"
#include "paddle/parameter/Argument.h" #include "paddle/legacy/parameter/Argument.h"
#pragma once #pragma once
namespace paddle { namespace paddle {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "gradient_machine.h" #include "gradient_machine.h"
#include "capi_private.h" #include "capi_private.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v) #define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <paddle/gserver/gradientmachines/GradientMachine.h> #include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
#include <paddle/trainer/TrainerConfigHelper.h> #include <paddle/trainer/TrainerConfigHelper.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
......
...@@ -46,6 +46,10 @@ cc_library(paddle_inference_api ...@@ -46,6 +46,10 @@ cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_shared SHARED
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
......
...@@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: ", in.type().name()); "Input tensor type is not supported: ", in.type().name());
memory::data_type out_type = in_type; memory::data_type out_type = in_type;
auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format()); auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format = auto out_format =
MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
void* in_data = GetDataFromTensor(in, in_type); void* in_data = GetDataFromTensor(in, in_type);
......
...@@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { ...@@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
return MKLDNNDataType::data_undef; return MKLDNNDataType::data_undef;
} }
inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
MKLDNNFormat default_format) {
return (dims_size == 1
? mkldnn::memory::format::x
: dims_size == 2 ? mkldnn::memory::format::nc : default_format);
}
#endif #endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
......
...@@ -18,17 +18,21 @@ limitations under the License. */ ...@@ -18,17 +18,21 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/data_type_transform.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
static void PassTensorData(Tensor* from, Tensor* to) { static void PassTensorData(Tensor *from, Tensor *to) {
to->ShareDataWith(*from); to->ShareDataWith(*from);
*from = Tensor(); *from = Tensor();
} }
void DataTransform(const OpKernelType& expected_kernel_type, void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType& kernel_type_for_var, const OpKernelType &kernel_type_for_var,
const Tensor& input_tensor, Tensor* output_tensor) { const Tensor &input_tensor, Tensor *output_tensor) {
bool transformed = false; bool transformed = false;
Tensor in; Tensor in;
in.ShareDataWith(input_tensor); in.ShareDataWith(input_tensor);
...@@ -48,8 +52,8 @@ void DataTransform(const OpKernelType& expected_kernel_type, ...@@ -48,8 +52,8 @@ void DataTransform(const OpKernelType& expected_kernel_type,
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur // Just set layout/format. No real transform occur
auto out_format = auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin)); ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor); out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN); out.set_layout(DataLayout::kMKLDNN);
...@@ -89,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type, ...@@ -89,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type,
output_tensor->ShareDataWith(in); output_tensor->ShareDataWith(in);
} }
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
Variable* out_var) { Variable *out_var) {
if (in_var.IsType<LoDTensor>()) { if (in_var.IsType<LoDTensor>()) {
auto& in_lod_tensor = in_var.Get<LoDTensor>(); auto &in_lod_tensor = in_var.Get<LoDTensor>();
auto* tran_lod_tensor = out_var->GetMutable<LoDTensor>(); auto *tran_lod_tensor = out_var->GetMutable<LoDTensor>();
tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_lod(in_lod_tensor.lod());
tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->set_layout(in_lod_tensor.layout());
tran_lod_tensor->ShareDataWith(tensor); tran_lod_tensor->ShareDataWith(tensor);
} else if (in_var.IsType<SelectedRows>()) { } else if (in_var.IsType<SelectedRows>()) {
auto& in_selected_rows = in_var.Get<SelectedRows>(); auto &in_selected_rows = in_var.Get<SelectedRows>();
auto* trans_selected_rows = out_var->GetMutable<SelectedRows>(); auto *trans_selected_rows = out_var->GetMutable<SelectedRows>();
trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_height(in_selected_rows.height());
trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->set_rows(in_selected_rows.rows());
trans_selected_rows->mutable_value()->ShareDataWith(tensor); trans_selected_rows->mutable_value()->ShareDataWith(tensor);
......
...@@ -30,12 +30,15 @@ limitations under the License. */ ...@@ -30,12 +30,15 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void DataTransform(const OpKernelType& expected_kernel_type, void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType& kernel_type_for_var, const OpKernelType &kernel_type_for_var,
const Tensor& input_tensor, Tensor* out); const Tensor &input_tensor, Tensor *out);
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, /**
Variable* out_var); * Set OutVar from InVar, except the tensor is shared with `tensor`
*/
void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
Variable *out_var);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -25,11 +25,12 @@ else() ...@@ -25,11 +25,12 @@ else()
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
endif() endif()
cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
......
...@@ -33,6 +33,8 @@ struct BuildStrategy { ...@@ -33,6 +33,8 @@ struct BuildStrategy {
GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
std::string debug_graphviz_path_{""}; std::string debug_graphviz_path_{""};
bool enable_data_balance_{true};
}; };
} // namespace details } // namespace details
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
namespace paddle {
namespace framework {
namespace details {
#ifdef PADDLE_WITH_CUDA
DataBalanceOpHandle::DataBalanceOpHandle(
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
: local_scopes_(local_scopes), places_(places) {
if (ctxs) {
for (auto &p : places_) {
this->dev_ctxes_[p] = ctxs->DevCtx(p);
}
}
}
#else
DataBalanceOpHandle::DataBalanceOpHandle(
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
#endif
std::string DataBalanceOpHandle::Name() const { return "data balance"; }
std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
const std::vector<int> &device_sizes) {
int device_num = device_sizes.size();
int total_size = 0;
int empty_num = 0;
std::vector<std::array<int, 2>> size_device_vec;
size_device_vec.reserve(device_num);
for (int i = 0; i < device_num; ++i) {
if (device_sizes[i] == 0) {
++empty_num;
}
total_size += device_sizes[i];
size_device_vec.push_back({{device_sizes[i], i}});
}
std::vector<std::array<int, 3>> res;
if (empty_num == 0) {
// No need to do data balance.
return res;
}
if (total_size < device_num) {
// No enough data.
PADDLE_THROW("There is no next data.");
}
std::sort(size_device_vec.begin(), size_device_vec.end(),
[](const std::array<int, 2> &a, const std::array<int, 2> &b) {
return a[0] > b[0];
});
int expected_device_size = total_size / device_num;
int src_idx = 0;
for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
if (size_device_vec[src_idx][0] <= expected_device_size) {
++src_idx;
PADDLE_ENFORCE_LT(
src_idx, device_num - empty_num,
"In current srategy an empty tensor should not be copy source.");
}
size_device_vec[src_idx][0] -= expected_device_size;
size_device_vec[dst_idx][0] += expected_device_size;
res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
expected_device_size}});
}
return res;
}
void DataBalanceOpHandle::RunImpl() {
if (places_.size() == 1) {
return;
}
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
int data_num = in_var_handles.size() / places_.size();
WaitInputVarGenerated();
std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
std::vector<int> device_sizes;
for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
"The name of input and output should be equal.");
int place_idx = i / data_num;
int data_idx = i % data_num;
auto *local_scope =
local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_);
PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
auto *tensor = tensor_var->GetMutable<LoDTensor>();
lod_tensors[data_idx].push_back(tensor);
int ins_size =
tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
if (data_idx == 0) {
device_sizes.emplace_back(ins_size);
} else {
PADDLE_ENFORCE_EQ(
ins_size, device_sizes.at(place_idx),
"All data on the same device shall have the same batch size.");
}
}
const auto &balance_plan = GetBalancePlan(device_sizes);
for (const auto &trans : balance_plan) {
for (int data_idx = 0; data_idx < data_num; ++data_idx) {
LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
int trans_ins_size = trans[2];
LoD src_lod = src_tensor->lod();
int src_ins_size =
src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
int cut_point = src_ins_size - trans_ins_size;
if (!src_lod.empty()) {
for (auto &level : src_lod) {
cut_point = level[cut_point];
}
}
TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
dst_tensor->place(), dst_tensor);
src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
if (!src_lod.empty()) {
dst_tensor->set_lod(SliceInLevel(
src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
src_tensor->set_lod(
SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
}
}
}
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace framework {
namespace details {
struct DataBalanceOpHandle : public OpHandleBase {
public:
#ifdef PADDLE_WITH_CUDA
DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
#else
DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
#endif
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
private:
// std::vector<(src_dev_id, dst_dev_id, trans_size)>
std::vector<std::array<int, 3>> GetBalancePlan(
const std::vector<int> &batch_size_per_device);
const std::vector<Scope *> local_scopes_;
const std::vector<platform::Place> places_;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -67,8 +67,8 @@ void FetchOpHandle::RunImpl() { ...@@ -67,8 +67,8 @@ void FetchOpHandle::RunImpl() {
#endif #endif
} else { } else {
tensors_[i].ShareDataWith(t); tensors_[i].ShareDataWith(t);
tensors_[i].set_lod(t.lod());
} }
tensors_[i].set_lod(t.lod());
} }
this->WaitAndMergeCPUTensors(); this->WaitAndMergeCPUTensors();
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h"
...@@ -215,7 +216,14 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -215,7 +216,14 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} else { } else {
// This op runs on all devices, and its output may have parameter's // This op runs on all devices, and its output may have parameter's
// gradients. // gradients.
CreateComputationalOps(&result, *op, places_.size()); if (op->Type() == "read" && strategy_.enable_data_balance_) {
op->SetAttr("throw_eof_exp", false);
CreateComputationalOps(&result, *op, places_.size());
const auto &data_var_names = op->Output("Out");
InsertDataBalanceOp(&result, data_var_names);
} else {
CreateComputationalOps(&result, *op, places_.size());
}
if (!is_forwarding && places_.size() > 1) { if (!is_forwarding && places_.size() > 1) {
// Currently, we assume that once gradient is generated, it can be // Currently, we assume that once gradient is generated, it can be
...@@ -360,6 +368,29 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, ...@@ -360,6 +368,29 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
} }
} }
void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
SSAGraph *result, const std::vector<std::string> &datas) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_));
#else
result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_));
#endif
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
for (const std::string &d_name : datas) {
auto &vars = result->vars_[i][d_name];
PADDLE_ENFORCE(!vars.empty());
op_handle->AddInput(vars.back().get());
auto var = new VarHandle(vars.size(), i, d_name, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
}
}
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og, const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const { std::unordered_set<std::string> *og_has_been_broadcast) const {
...@@ -512,7 +543,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, ...@@ -512,7 +543,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
// the variable name which contains .block means it was splited by // the variable name which contains .block means it was splited by
// split_byref op // split_byref op
// so that we can balance the variable blocks to all the pserver instances. // so that we can balance the variable blocks to all the pserver
// instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
op.InputArgumentNames()[0].find(".block") == std::string::npos) { op.InputArgumentNames()[0].find(".block") == std::string::npos) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
......
...@@ -101,6 +101,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -101,6 +101,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
void InsertDataBalanceOp(SSAGraph *result,
const std::vector<std::string> &datas) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
......
...@@ -58,8 +58,10 @@ void OpHandleBase::Run(bool use_cuda) { ...@@ -58,8 +58,10 @@ void OpHandleBase::Run(bool use_cuda) {
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_NOT_NULL(waited_ctx);
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) { for (auto &dev_ctx : dev_ctxes_) {
PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
dev_ctx.second->Wait(); dev_ctx.second->Wait();
} }
} else { } else {
......
...@@ -20,9 +20,7 @@ limitations under the License. */ ...@@ -20,9 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/grpc_client.h"
#endif
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -49,8 +47,7 @@ Executor::Executor(const platform::Place& place) : place_(place) {} ...@@ -49,8 +47,7 @@ Executor::Executor(const platform::Place& place) : place_(place) {}
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
void Executor::Complete() { void Executor::Complete() {
::paddle::operators::distributed::RPCClient::GetInstance< ::paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>()
::paddle::operators::distributed::GRPCClient>()
->SendComplete(); ->SendComplete();
} }
#endif #endif
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
...@@ -68,9 +69,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { ...@@ -68,9 +69,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
// only print first ten elements // only print first ten elements
int64_t size = t.numel() < 10 ? t.numel() : 10; int64_t size = t.numel() < 10 ? t.numel() : 10;
for (int64_t i = 0; i < size; ++i) { for (int64_t i = 0; i < size; ++i) {
if (t.type().hash_code() == typeid(float).hash_code()) { if (IsType<float>(t.type())) {
os << t.data<float>()[i] << " "; os << t.data<float>()[i] << " ";
} else if (t.type().hash_code() == typeid(int64_t).hash_code()) { } else if (IsType<int64_t>(t.type())) {
os << t.data<int64_t>()[i] << " "; os << t.data<int64_t>()[i] << " ";
} else { } else {
PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); PADDLE_THROW("LoDTensor data type not in [float, int64_t]");
...@@ -89,6 +90,7 @@ std::string LoDToString(const LoD &lod) { ...@@ -89,6 +90,7 @@ std::string LoDToString(const LoD &lod) {
LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
size_t elem_end) { size_t elem_end) {
PADDLE_ENFORCE_LT(level, in.size()); PADDLE_ENFORCE_LT(level, in.size());
PADDLE_ENFORCE_LT(elem_begin, elem_end);
PADDLE_ENFORCE_LT(elem_end, in[level].size()); PADDLE_ENFORCE_LT(elem_end, in[level].size());
LoD res; LoD res;
...@@ -384,7 +386,7 @@ void LoDTensor::MergeLoDTensor( ...@@ -384,7 +386,7 @@ void LoDTensor::MergeLoDTensor(
LoD new_lod = lod_tensors[0]->lod(); LoD new_lod = lod_tensors[0]->lod();
for (size_t i = 1; i < lod_tensors.size(); ++i) { for (size_t i = 1; i < lod_tensors.size(); ++i) {
auto *t = lod_tensors[i]; auto *t = lod_tensors[i];
PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code()); PADDLE_ENFORCE_EQ(new_type, t->type());
PADDLE_ENFORCE_EQ(new_layout, t->layout()); PADDLE_ENFORCE_EQ(new_layout, t->layout());
PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0],
...@@ -392,6 +394,7 @@ void LoDTensor::MergeLoDTensor( ...@@ -392,6 +394,7 @@ void LoDTensor::MergeLoDTensor(
new_dim[0] += t->dims()[0]; new_dim[0] += t->dims()[0];
auto &lod = t->lod(); auto &lod = t->lod();
PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
for (size_t j = 0; j < lod.size(); ++j) { for (size_t j = 0; j < lod.size(); ++j) {
auto &sub_lod = new_lod[j]; auto &sub_lod = new_lod[j];
auto &offset = sub_lod.back(); auto &offset = sub_lod.back();
......
...@@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { ...@@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
return ret; return ret;
} }
inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) {
return (!platform::places_are_same_class(l.place_, r.place_)) || return (!platform::places_are_same_class(l.place_, r.place_)) ||
(l.data_type_ != r.data_type_) || (l.data_type_ != r.data_type_) ||
NeedTransformLayout(l.data_layout_, r.data_layout_); NeedTransformLayout(l.data_layout_, r.data_layout_);
......
...@@ -76,6 +76,20 @@ class OpRegistry { ...@@ -76,6 +76,20 @@ class OpRegistry {
template <typename PlaceType, bool at_end, size_t I, typename... KernelType> template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctor; struct OpKernelRegistrarFunctor;
template <typename PlaceType, typename T, typename Func>
inline void RegisterKernelClass(const char* op_type, const char* library_type,
Func func) {
std::string library(library_type);
std::string data_layout = "ANYLAYOUT";
if (library == "MKLDNN") {
data_layout = "MKLDNNLAYOUT";
}
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
StringToDataLayout(data_layout),
StringToLibraryType(library_type));
OperatorWithKernel::AllOpKernels()[op_type][key] = func;
}
template <typename PlaceType, size_t I, typename... KernelTypes> template <typename PlaceType, size_t I, typename... KernelTypes>
struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
using KERNEL_TYPE = using KERNEL_TYPE =
...@@ -83,16 +97,10 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { ...@@ -83,16 +97,10 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
void operator()(const char* op_type, const char* library_type) const { void operator()(const char* op_type, const char* library_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE; using T = typename KERNEL_TYPE::ELEMENT_TYPE;
std::string library(library_type); RegisterKernelClass<PlaceType, T>(
std::string data_layout = "ANYLAYOUT"; op_type, library_type, [](const framework::ExecutionContext& ctx) {
if (library == "MKLDNN") { KERNEL_TYPE().Compute(ctx);
data_layout = "MKLDNNLAYOUT"; });
}
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
StringToDataLayout(data_layout),
StringToLibraryType(library_type));
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...> OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
func; func;
...@@ -116,6 +124,47 @@ class OpKernelRegistrar : public Registrar { ...@@ -116,6 +124,47 @@ class OpKernelRegistrar : public Registrar {
} }
}; };
template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctorEx;
template <typename PlaceType, typename... DataTypeAndKernelType>
class OpKernelRegistrarEx : public Registrar {
public:
explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) {
OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
func;
func(op_type, library_type);
}
};
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
DataTypeAndKernelType...> {
void operator()(const char* op_type, const char* library_type) const {}
};
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
DataTypeAndKernelType...> {
using Functor =
typename std::tuple_element<I + 1,
std::tuple<DataTypeAndKernelType...>>::type;
using T =
typename std::tuple_element<I,
std::tuple<DataTypeAndKernelType...>>::type;
void operator()(const char* op_type, const char* library_type) const {
RegisterKernelClass<PlaceType, T>(op_type, library_type, Functor());
constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
DataTypeAndKernelType...>
func;
func(op_type, library_type);
}
};
/** /**
* check if MACRO is used in GLOBAL NAMESPACE. * check if MACRO is used in GLOBAL NAMESPACE.
*/ */
...@@ -174,6 +223,25 @@ class OpKernelRegistrar : public Registrar { ...@@ -174,6 +223,25 @@ class OpKernelRegistrar : public Registrar {
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##library_type##__, \
"REGISTER_OP_KERNEL_EX must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrarEx<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \
#library_type); \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \
return 0; \
}
#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \
__VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
/** /**
* Macro to mark what Operator and Kernel * Macro to mark what Operator and Kernel
* we will use and tell the compiler to * we will use and tell the compiler to
......
...@@ -592,8 +592,7 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -592,8 +592,7 @@ static void CheckTensorNANOrInf(const std::string& name,
if (tensor.memory_size() == 0) { if (tensor.memory_size() == 0) {
return; return;
} }
if (tensor.type().hash_code() != typeid(float).hash_code() && // NOLINT if (!IsType<float>(tensor.type()) && !IsType<double>(tensor.type())) {
tensor.type().hash_code() != typeid(double).hash_code()) { // NOLINT
return; return;
} }
PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
...@@ -620,8 +619,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -620,8 +619,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
"There are no kernels which are registered in the %s operator.", type_); "There are no kernels which are registered in the %s operator.", type_);
} }
ExecutionContext ctx(*this, scope, *dev_ctx);
OpKernelMap& kernels = kernels_iter->second; OpKernelMap& kernels = kernels_iter->second;
// TODO(dzhwinter) : kernel fallback mechanism will be added when all the // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
...@@ -631,7 +628,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -631,7 +628,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// Do selection // Do selection
// } // }
auto expected_kernel_key = this->GetExpectedKernelType(ctx); auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key; VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
...@@ -640,56 +638,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -640,56 +638,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString(expected_kernel_key)); KernelTypeToString(expected_kernel_key));
} }
// do data transform // do data transformScope &transfer_scope;
Scope& new_scope = scope.NewScope(); std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
std::vector<std::string> inplace_vars; // exec scope is the scope that kernel actually executed on.
for (auto& var_name_item : this->Inputs()) { const Scope& exec_scope =
for (auto& var_name : var_name_item.second) { (transfer_scope == nullptr ? scope : *transfer_scope);
auto* var = scope.FindVar(var_name);
if (var && VarIsTensor(var)) { if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
auto* tensor_in = GetTensorFromVar(var); dev_ctx = pool.Get(expected_kernel_key.place_);
if (tensor_in->IsInitialized()) {
auto kernel_type_for_var = this->GetKernelTypeForVar(
var_name_item.first, *tensor_in, expected_kernel_key);
if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) {
auto out_var_names = OutputVars(true);
if (std::find(out_var_names.begin(), out_var_names.end(),
var_name) != out_var_names.end()) {
inplace_vars.push_back(var_name);
}
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
auto* trans_var = new_scope.Var(var_name);
std::shared_ptr<Tensor> out(new Tensor);
DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in,
out.get());
CopyVariableWithTensor(*var, *(out.get()), trans_var);
}
}
}
}
} }
auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
kernel_iter->second->Compute(
ExecutionContext(*this, new_scope, *new_dev_ctx));
for (auto& var_name : inplace_vars) { if (!transfered_inplace_vars.empty()) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; // there is inplace variable has been transfered.
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name));
original_tensor->ShareDataWith(*transformed_tensor);
} }
/*For profiling/benchmark only*/ /*For profiling/benchmark only*/
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
new_dev_ctx->Wait(); dev_ctx->Wait();
} }
if (FLAGS_check_nan_inf) { if (FLAGS_check_nan_inf) {
for (auto& vname : OutputVars(true)) { for (auto& vname : OutputVars(true)) {
auto* var = new_scope.FindVar(vname); auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue; if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) { if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>()); CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
...@@ -697,6 +673,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -697,6 +673,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
} }
} }
} }
void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
auto* transformed_tensor =
GetTensorFromVar(transfer_scope.FindVar(var_name));
original_tensor->ShareDataWith(*transformed_tensor);
}
}
Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope.FindVar(var_name);
// Only tensor can be tranfer to another device.
if (var == nullptr || !VarIsTensor(var)) {
continue;
}
auto* tensor_in = GetTensorFromVar(var);
if (!tensor_in->IsInitialized()) {
continue;
}
auto kernel_type_for_var = GetKernelTypeForVar(
var_name_item.first, *tensor_in, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
}
auto out_var_names = OutputVars(true);
if (std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
out_var_names.end()) {
transfered_inplace_vars->emplace_back(var_name);
}
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
if (new_scope == nullptr) {
new_scope = &scope.NewScope();
}
auto* trans_var = new_scope->Var(var_name);
Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var);
}
}
return new_scope;
}
proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
...@@ -713,10 +747,6 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -713,10 +747,6 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &var->Get<LoDTensor>(); t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value()); t = &(var->Get<SelectedRows>().value());
} else if (var->IsType<LoDTensorArray>()) {
const LoDTensorArray& arr = var->Get<LoDTensorArray>();
PADDLE_ENFORCE(arr.size() > 0);
t = &(arr[0]);
} }
if (t != nullptr) { if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type())); int tmp = static_cast<int>(ToDataType(t->type()));
......
...@@ -347,9 +347,9 @@ class OpKernel : public OpKernelBase { ...@@ -347,9 +347,9 @@ class OpKernel : public OpKernelBase {
class OperatorWithKernel : public OperatorBase { class OperatorWithKernel : public OperatorBase {
public: public:
using OpKernelFunc = std::function<void(const ExecutionContext&)>;
using OpKernelMap = using OpKernelMap =
std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>, std::unordered_map<OpKernelType, OpKernelFunc, OpKernelType::Hash>;
OpKernelType::Hash>;
OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
...@@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase { ...@@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase {
// same. // same.
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place) const final;
/**
* Transfer data from scope to a transfered scope. If there is no data need to
* be tranfered, it returns nullptr.
*
* * transfered_inplace_vars is a output vector.
*/
Scope* TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const;
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
}; };
extern bool OpSupportGPU(const std::string& op_type); extern bool OpSupportGPU(const std::string& op_type);
......
...@@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ...@@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
t->set_lod(lod_tensors[j].lod()); t->set_lod(lod_tensors[j].lod());
} }
} }
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
} }
ParallelExecutor::~ParallelExecutor() { ParallelExecutor::~ParallelExecutor() {
......
...@@ -23,9 +23,9 @@ namespace framework { ...@@ -23,9 +23,9 @@ namespace framework {
template <typename T> template <typename T>
inline const T* Tensor::data() const { inline const T* Tensor::data() const {
check_memory_size(); check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value || bool valid = std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T)), holder_->type() == std::type_index(typeid(T));
"Tensor holds the wrong type, it holds %s", PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name()); this->holder_->type().name());
return reinterpret_cast<const T*>( return reinterpret_cast<const T*>(
...@@ -37,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } ...@@ -37,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
template <typename T> template <typename T>
inline T* Tensor::data() { inline T* Tensor::data() {
check_memory_size(); check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value || bool valid = std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T)), holder_->type() == std::type_index(typeid(T));
"Tensor holds the wrong type, it holds %s", PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name()); this->holder_->type().name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_); offset_);
......
...@@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); if (platform::is_same_place(src_place, dst_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
if (platform::is_same_place(ctx_place, src_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (platform::is_same_place(ctx_place, dst_place)) {
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
}
}
} }
#endif #endif
} }
...@@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) { Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx; const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(src.place())) { if (platform::is_gpu_place(dst_place)) {
dev_ctx = pool.Get(src.place());
} else {
dev_ctx = pool.Get(dst_place); dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
} }
TensorCopy(src, dst_place, *dev_ctx, dst); TensorCopy(src, dst_place, *dev_ctx, dst);
} }
......
...@@ -23,10 +23,25 @@ limitations under the License. */ ...@@ -23,10 +23,25 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
// and dst_place are two different GPU, to ensure that the operation can
// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
// If ctx_place and src_place are the same, src_ctx.Wait() is added
// after memory::Copy; if ctx_place and dst_place are the same,
// src_ctx.Wait() is added before memory::Copy.
void TensorCopy(const Tensor& src, const platform::Place& dst_place, void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst); const platform::DeviceContext& ctx, Tensor* dst);
// NOTE(zcd): If the src.place() and dst_place are two different GPU,
// the copy operation is carried out on the dst_place's stream. This is
// very important, because TensorCopy is an async operator, and in most
// case, once this copy operator returns, dst is to be used in dst_place's
// stream, if this copy operation is carried out on the src_place's stream,
// when dst is used in dst_place's stream the copy operation may be
// not completed.
void TensorCopy(const Tensor& src, const platform::Place& dst_place, void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst); Tensor* dst);
void TensorCopySync(const Tensor& src, const platform::Place& dst_place, void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst); Tensor* dst);
......
...@@ -24,18 +24,24 @@ limitations under the License. */ ...@@ -24,18 +24,24 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename T>
bool IsType(const std::type_index& type_index) {
return type_index == std::type_index(typeid(T));
}
inline proto::VarType::Type ToVarType(std::type_index type) { inline proto::VarType::Type ToVarType(std::type_index type) {
if (type.hash_code() == typeid(LoDTensor).hash_code()) { if (IsType<LoDTensor>(type)) {
return proto::VarType_Type_LOD_TENSOR; return proto::VarType_Type_LOD_TENSOR;
} else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { } else if (IsType<LoDRankTable>(type)) {
return proto::VarType_Type_LOD_RANK_TABLE; return proto::VarType_Type_LOD_RANK_TABLE;
} else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { } else if (IsType<LoDTensorArray>(type)) {
return proto::VarType_Type_LOD_TENSOR_ARRAY; return proto::VarType_Type_LOD_TENSOR_ARRAY;
} else if (type.hash_code() == typeid(SelectedRows).hash_code()) { } else if (IsType<SelectedRows>(type)) {
return proto::VarType_Type_SELECTED_ROWS; return proto::VarType_Type_SELECTED_ROWS;
} else if (type.hash_code() == typeid(ReaderHolder).hash_code()) { } else if (IsType<ReaderHolder>(type)) {
return proto::VarType_Type_READER; return proto::VarType_Type_READER;
} else if (type.hash_code() == typeid(ChannelHolder).hash_code()) { } else if (IsType<ChannelHolder>(type)) {
return proto::VarType_Type_CHANNEL; return proto::VarType_Type_CHANNEL;
} else { } else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
......
# Inference Analysis
The `inference/analysis` module is used to analyze and optimize the inference program,
it references some philosophy from `LLVM/analysis`,
and make the various optimization features be pluggable and co-exist in a pipeline.
We borrowed some concepts from LLVM, such as
- [Pass](./pass.h)es to implement optimization that traverse the inference program,
- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
There are some other basic concepts here
- [Node](./node.h), the node in a `DataFlowGraph`,
- `Function`, the Operator in Fluid,
- `Value`, the Variable in Fluid;
- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline,
## How it works
The `inference/analysis` module make all the passes in a pipeline, and works in such way:
1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
The new optimization features can be added as an independent `Pass` and controlled by gflags,
each pass will generate unified debug information or visualization for better debugging.
## Supported Passes
### `FluidToDataFlowGraphPass`
Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes,
this should be the first pass of the pipeline.
### `DataFlowGraphToFluidPass`
Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline.
### `TensorRTSubgraphNodeMarkPass`
Mark the `Node` that are supported by TensorRT,
this pass will generate a visualization file which can be used for debugging.
### `TensorRTSubGraphPass`
Split the sub-graph that are can be accelerated by TensorRT.
### `DFG_GraphvizDrawPass`
This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
It can be used as a helper class that draws the modified graph after each pass.
## Utilities
There is some helper legacy/function/class for analysis.
- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms, it uses `iterator` to make the algorithms easy to share across different passes.
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
...@@ -79,4 +80,4 @@ void Analyzer::Run(Argument* argument) { ...@@ -79,4 +80,4 @@ void Analyzer::Run(Argument* argument) {
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
\ No newline at end of file
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
/* /*
* This file contains Analyzer, an class that exposed as a library that analyze * This file contains Analyzer, an class that exposed as a library that analyze
* and optimize * and optimize
......
...@@ -138,7 +138,7 @@ struct GraphTraits<DataFlowGraph> { ...@@ -138,7 +138,7 @@ struct GraphTraits<DataFlowGraph> {
// sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph. // sub-graph.
static std::pair<std::vector<Node *>, std::vector<Node *>> static std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::unordered_set<Node *> nodes(graph.begin(), graph.end()); std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs; std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs; std::unordered_set<Node *> outputs;
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include <vector>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/framework/proto_desc.h"
...@@ -150,13 +151,14 @@ namespace { ...@@ -150,13 +151,14 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass { class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public: public:
using Config = DFG_GraphvizDrawPass::Config; using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {} explicit DFG_DebuggerPass(const Config& config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
bool Finalize() override { return true; } bool Finalize() override { return true; }
}; };
} } // namespace
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
......
...@@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass { ...@@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
const bool display_deleted_node; const bool display_deleted_node;
}; };
DFG_GraphvizDrawPass(const Config &config) : config_(config) {} explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
bool Initialize(Argument *argument) override { return true; } bool Initialize(Argument *argument) override { return true; }
void Run(DataFlowGraph *graph) override; void Run(DataFlowGraph *graph) override;
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
...@@ -88,7 +88,8 @@ namespace { ...@@ -88,7 +88,8 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass { class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public: public:
using Config = DFG_GraphvizDrawPass::Config; using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} explicit DFG_DebuggerPass(const Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
bool Finalize() override { return true; } bool Finalize() override { return true; }
}; };
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <typeindex>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -41,7 +42,7 @@ int AccuDims(Vec &&vec, int size) { ...@@ -41,7 +42,7 @@ int AccuDims(Vec &&vec, int size) {
return res; return res;
} }
#define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__; #define SET_TYPE(type__) dic_[std::type_index(typeid(type__))] = #type__;
/* /*
* Map typeid to representation. * Map typeid to representation.
*/ */
...@@ -53,14 +54,14 @@ struct DataTypeNamer { ...@@ -53,14 +54,14 @@ struct DataTypeNamer {
template <typename T> template <typename T>
const std::string &repr() const { const std::string &repr() const {
auto x = typeid(T).hash_code(); auto x = std::type_index(typeid(T));
PADDLE_ENFORCE(dic_.count(x), "unknown type for representation"); PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
return dic_.at(x); return dic_.at(x);
} }
const std::string &repr(size_t &hash) const { // NOLINT const std::string &repr(const std::type_index &type) const { // NOLINT
PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation"); PADDLE_ENFORCE(dic_.count(type), "unknown type for representation");
return dic_.at(hash); return dic_.at(type);
} }
private: private:
...@@ -71,9 +72,7 @@ struct DataTypeNamer { ...@@ -71,9 +72,7 @@ struct DataTypeNamer {
SET_TYPE(void *); SET_TYPE(void *);
} }
std::unordered_map<decltype(typeid(int).hash_code()), // NOLINT std::unordered_map<std::type_index, std::string> dic_;
std::string>
dic_;
}; };
#undef SET_TYPE #undef SET_TYPE
......
...@@ -23,9 +23,9 @@ namespace analysis { ...@@ -23,9 +23,9 @@ namespace analysis {
template <> template <>
std::string &NodeAttr::As<std::string>() { std::string &NodeAttr::As<std::string>() {
if (data_.empty()) { if (data_.empty()) {
type_hash_ = typeid(std::string).hash_code(); type_index_ = std::type_index(typeid(std::string));
} }
PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code()); PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string)));
return data_; return data_;
} }
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/device.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
...@@ -57,12 +58,12 @@ struct NodeAttr { ...@@ -57,12 +58,12 @@ struct NodeAttr {
// init storage in the first usage. // init storage in the first usage.
if (data_.empty()) { if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T); VLOG(4) << "resize data to " << sizeof(T);
type_hash_ = typeid(T).hash_code(); type_index_ = std::type_index(typeid(T));
data_.resize(sizeof(T)); data_.resize(sizeof(T));
} }
PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), PADDLE_ENFORCE(framework::IsType<T>(type_index_),
"type not matched, origin is %s, want %s", "type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_hash_), DataTypeNamer::Global().repr(type_index_),
DataTypeNamer::Global().repr<T>()); DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]); return *reinterpret_cast<T *>(&data_[0]);
...@@ -70,7 +71,7 @@ struct NodeAttr { ...@@ -70,7 +71,7 @@ struct NodeAttr {
private: private:
std::string data_; std::string data_;
size_t type_hash_{std::numeric_limits<size_t>::max()}; std::type_index type_index_{typeid(NodeAttr)};
}; };
/* /*
......
...@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h" #include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
......
...@@ -12,10 +12,12 @@ ...@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include <string>
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/node_attr_flags.h" #include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { ...@@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
class DfgDebuggerPass : public DFG_GraphvizDrawPass { class DfgDebuggerPass : public DFG_GraphvizDrawPass {
public: public:
DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
: DFG_GraphvizDrawPass(config) {} : DFG_GraphvizDrawPass(config) {}
std::string repr() const override { std::string repr() const override {
......
...@@ -16,6 +16,10 @@ ...@@ -16,6 +16,10 @@
* This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops
* that supported by TensorRT engine. * that supported by TensorRT engine.
*/ */
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
...@@ -30,7 +34,8 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { ...@@ -30,7 +34,8 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
public: public:
using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {} explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller)
: teller_(teller) {}
bool Initialize(Argument* argument) override { return true; } bool Initialize(Argument* argument) override { return true; }
...@@ -38,8 +43,10 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { ...@@ -38,8 +43,10 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
// sub-graph into TensorRT. // sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override; void Run(DataFlowGraph* graph) override;
std::string repr() const { return "tensorrt-sub-subgraph-mark"; } std::string repr() const override { return "tensorrt-sub-subgraph-mark"; }
std::string description() const { return "tensorrt sub-graph mark pass"; } std::string description() const override {
return "tensorrt sub-graph mark pass";
}
Pass* CreateGraphvizDebugerPass() const override; Pass* CreateGraphvizDebugerPass() const override;
bool Finalize() override; bool Finalize() override;
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
...@@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
// Tell whether to transform a sub-graph into TensorRT. // Tell whether to transform a sub-graph into TensorRT.
using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller;
TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
bool Initialize(Argument* argument) override { return true; } bool Initialize(Argument* argument) override { return true; }
...@@ -40,8 +41,8 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -40,8 +41,8 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
bool Finalize() override { return true; } bool Finalize() override { return true; }
std::string repr() const { return "tensorrt-sub-graph"; } std::string repr() const override { return "tensorrt-sub-graph"; }
std::string description() const { return "tensorrt sub graph pass"; } std::string description() const override { return "tensorrt sub graph pass"; }
private: private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_; NodeInsideSubgraphTeller node_inside_subgraph_teller_;
...@@ -49,4 +50,4 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -49,4 +50,4 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // paddle } // namespace paddle
...@@ -20,6 +20,12 @@ limitations under the License. */ ...@@ -20,6 +20,12 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle { namespace paddle {
...@@ -41,6 +47,9 @@ template <> ...@@ -41,6 +47,9 @@ template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size); void* p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p; VLOG(10) << " pointer=" << p;
return p; return p;
} }
...@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { ...@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place); LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev); platform::SetDeviceId(cur_dev);
} }
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr; return ptr;
} }
...@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, ...@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace"; << " bytes in CUDAPinnedPlace";
} }
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr; return ptr;
} }
......
...@@ -184,6 +184,7 @@ else() ...@@ -184,6 +184,7 @@ else()
set(DEPS_OPS ${DEPS_OPS} nccl_op) set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif() endif()
set(DISTRIBUTE_DEPS "")
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
add_subdirectory(distributed) add_subdirectory(distributed)
...@@ -192,6 +193,18 @@ if(WITH_DISTRIBUTE) ...@@ -192,6 +193,18 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
else() else()
set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib)
if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
find_library(RDMACM_LIBRARY NAMES rdmacm)
ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
endif()
endif() endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
...@@ -205,7 +218,7 @@ if(WITH_DISTRIBUTE) ...@@ -205,7 +218,7 @@ if(WITH_DISTRIBUTE)
# listen_and_serv_op sum_op executor SERIAL) # listen_and_serv_op sum_op executor SERIAL)
if(WITH_GPU) if(WITH_GPU)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
if(WITH_GRPC) if(WITH_GRPC)
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
else() else()
...@@ -297,6 +310,7 @@ foreach(src ${DETECTION_LIBRARY}) ...@@ -297,6 +310,7 @@ foreach(src ${DETECTION_LIBRARY})
endforeach() endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/argsort_op.h"
namespace paddle {
namespace operators {
class ArgsortOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ArgsortOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ArgsortOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Indices"),
"Output(Indices) of ArgsortOp should not be null.");
auto in_dims = ctx->GetInputDim("X");
int axis = ctx->Attrs().Get<int>("axis");
auto num_dims = in_dims.size();
PADDLE_ENFORCE(axis < num_dims,
"Attr(axis) %d of ArgsortOp is out of bounds for Input(X)'s "
"rank %d.",
axis, num_dims);
PADDLE_ENFORCE(axis >= -num_dims,
"Attr(axis) %d of ArgsortOp must be not less than "
"-rank(Input(X)) (%d).",
axis, num_dims);
ctx->SetOutputDim("Out", in_dims);
ctx->SetOutputDim("Indices", in_dims);
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
};
class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) The input of Argsort op.");
AddOutput("Out",
"(Tensor) The sorted tensor of Argsort op, with the same "
"shape as Input(X).");
AddOutput("Indices",
"(Tensor) The indices of a tensor giving the sorted order, with "
"the same shape as Input(X).");
AddComment(R"DOC(
Argsort operator
Performs sorting on the input tensor along the given axis and outputs two
tensors, Output(Out) and Output(Indices). They reserve the same shape
with Input(X), and Output(Out) represents the sorted tensor while
Output(Indices) gives the sorted order along the given axis Attr(axis).
)DOC");
AddAttr<int>("axis",
"(int, default -1) The axis along which to sort the tensor. "
"When axis < 0, the actual axis will be the |axis|'th "
"counting backwards. Default -1, the last dimension.")
.SetDefault(-1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(argsort,
ops::ArgsortKernel<paddle::platform::CPUPlace, float>,
ops::ArgsortKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/argsort_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS;
const int kMaxRank = 9; // The max rank of a tensor allowed in Fluid
__global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size,
int axis, int64_t n, int64_t* trg_idx,
int64_t* med_ids) {
int64_t index = threadIdx.x + blockDim.x * blockIdx.x;
if (index < n) {
int64_t shape_out_axis[kMaxRank - 1] = {0};
int64_t dims_out_axis[kMaxRank - 1] = {0};
int64_t tmp = index;
int64_t pos_in_axis = 0;
int64_t i = dims_size - 2;
int64_t dim_axis = 0;
for (int64_t j = dims_size - 1; j >= 0; --j) {
int64_t dim = in_dims[j];
if (j != axis) {
shape_out_axis[i] = tmp % dim;
dims_out_axis[i] = dim;
i--;
} else {
dim_axis = dim;
pos_in_axis = tmp % dim_axis;
}
tmp /= dim;
}
int64_t group = (dims_size > 1) ? shape_out_axis[0] : 0;
for (int64_t j = 0; j < dims_size - 2; ++j) {
group = group * dims_out_axis[j + 1] + shape_out_axis[j + 1];
}
int64_t traget_idx = group * dim_axis + pos_in_axis;
trg_idx[index] = traget_idx;
med_ids[traget_idx] = pos_in_axis;
}
}
template <typename T>
__global__ void PermuteInData(const T* in, const int64_t* trg_idx, int64_t n,
T* med_out) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
if (index < n) {
med_out[trg_idx[index]] = in[index];
}
}
template <typename T>
__global__ void Sort(int64_t axis_dim, int64_t groups, T* med_out,
int64_t* med_ids) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
if (index < groups) {
thrust::sort_by_key(thrust::device, med_out + index * axis_dim,
med_out + axis_dim * (1 + index),
med_ids + index * axis_dim);
}
}
template <typename T>
__global__ void PermuteMediateData(const T* med_out, const int64_t* med_ids,
const int64_t* trg_idx, int64_t n, T* out,
int64_t* indices) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
if (index < n) {
out[index] = med_out[trg_idx[index]];
indices[index] = med_ids[trg_idx[index]];
}
}
template <typename T>
class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices");
int axis = ctx.Attr<int>("axis");
auto in_dims = input->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
const T* in_data = input->data<T>();
T* out_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
int64_t numel = input->numel();
int64_t groups = numel / in_dims[axis];
std::vector<int64_t> in_dims_vec = vectorize(in_dims);
thrust::device_vector<int64_t> in_dims_dev(in_dims_vec.begin(),
in_dims_vec.end());
int64_t* in_dims_data = thrust::raw_pointer_cast(in_dims_dev.data());
// Mediate tensor for sorting data and indices
Tensor mediate_output, mediate_indices;
T* med_out_data =
mediate_output.mutable_data<T>(input->dims(), ctx.GetPlace());
int64_t* med_ids_data =
mediate_indices.mutable_data<int64_t>(in_dims, ctx.GetPlace());
// Target index of each element along the given axis in the mediate tensors
Tensor trg_idx_t;
int64_t* trg_idx = trg_idx_t.mutable_data<int64_t>(in_dims, ctx.GetPlace());
auto stream = ctx.cuda_device_context().stream();
const int num_threads = PADDLE_CUDA_NUM_THREADS;
ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>(
in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data);
PermuteInData<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>(
in_data, trg_idx, numel, med_out_data);
Sort<<<(groups - 1) / num_threads + 1, num_threads, 0, stream>>>(
in_dims[axis], groups, med_out_data, med_ids_data);
PermuteMediateData<<<(numel - 1) / num_threads + 1, num_threads, 0,
stream>>>(med_out_data, med_ids_data, trg_idx, numel,
out_data, ids_data);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(argsort, paddle::operators::ArgsortOpCUDAKernel<float>,
paddle::operators::ArgsortOpCUDAKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ArgsortKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
auto* indices = ctx.Output<framework::Tensor>("Indices");
int axis = ctx.Attr<int>("axis");
auto in_dims = input->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
const T* in_data = input->data<T>();
T* out_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
int64_t groups = input->numel() / in_dims[axis];
int64_t stride = (axis == in_dims.size() - 1)
? 1
: framework::product(framework::slice_ddim(
in_dims, axis + 1, in_dims.size()));
for (int64_t i = 0; i < groups; ++i) {
int64_t idx = i;
std::vector<int64_t> shape_vec(in_dims.size(), 0);
for (int64_t dim = in_dims.size() - 1; dim >= 0; --dim) {
if (dim != axis) {
shape_vec[dim] = idx % in_dims[dim];
idx /= in_dims[dim];
}
}
int64_t start_index = shape_vec[0];
for (int64_t dim = 0; dim < in_dims.size() - 1; ++dim) {
start_index = start_index * in_dims[dim + 1] + shape_vec[dim + 1];
}
std::vector<int64_t> org_index_vec(in_dims[axis], start_index);
for (int64_t j = 1; j < in_dims[axis]; ++j) {
org_index_vec[j] += j * stride;
}
std::sort(org_index_vec.begin(), org_index_vec.end(),
[in_data](const int64_t v1, const int64_t v2) {
return in_data[v1] < in_data[v2];
});
for (size_t j = 0; j < org_index_vec.size(); ++j) {
int64_t index = start_index + j * stride;
out_data[index] = in_data[org_index_vec[j]];
ids_data[index] = (org_index_vec[j] - start_index) / stride;
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor // create mkldnn memory from input x tensor
auto src_memory = mkldnn::memory::format input_format =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, platform::MKLDNNFormatForSize(src_tz.size(), x->format());
to_void_cast(x_data));
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// create primitive descriptor for batch norm forward // create primitive descriptor for batch norm forward
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
...@@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>; using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
// create mkldnn memory from input diff_y tensor // create mkldnn memory from input diff_y tensor
auto user_diff_dst_memory =
memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, mkldnn::memory::format dst_format =
mkldnn_engine}, platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
to_void_cast(diff_y_data));
auto user_diff_dst_memory = memory(
{{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
// create mkldnn memory from input x tensor // create mkldnn memory from input x tensor
auto src_memory = mkldnn::memory::format input_format =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, platform::MKLDNNFormatForSize(src_tz.size(), x->format());
to_void_cast(x_data));
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// for diff_dst, try to use same format as dst in forward pass // for diff_dst, try to use same format as dst in forward pass
auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -47,7 +48,7 @@ class ConditionalOp : public framework::OperatorBase { ...@@ -47,7 +48,7 @@ class ConditionalOp : public framework::OperatorBase {
if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
PADDLE_THROW("should have one initialized input as condition"); PADDLE_THROW("should have one initialized input as condition");
} }
if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() && // NOLINT if (!(framework::IsType<bool>(ips[0]->type()) && // NOLINT
ips[0]->numel() == 1)) { ips[0]->numel() == 1)) {
PADDLE_THROW( PADDLE_THROW(
"condition input's data type should be bool, " "condition input's data type should be bool, "
......
...@@ -14,14 +14,22 @@ ...@@ -14,14 +14,22 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_DISTRIBUTE
#ifdef PADDLE_WITH_GRPC #ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc_server.h"
#define RPCSERVER_T distributed::AsyncGRPCServer #define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
#define RPCCLIENT_T distributed::GRPCClient #define RPCCLIENT_T paddle::operators::distributed::GRPCClient
#else
#else // PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/operators/distributed/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc_server.h"
#define RPCSERVER_T distributed::AsyncBRPCServer #define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
#define RPCCLIENT_T distributed::BRPCClient #define RPCCLIENT_T paddle::operators::distributed::BRPCClient
#endif
#endif // PADDLE_WITH_GRPC
#endif // PADDLE_WITH_DISTRIBUTE
...@@ -22,6 +22,8 @@ iou_similarity_op.cu) ...@@ -22,6 +22,8 @@ iou_similarity_op.cu)
detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
detection_library(anchor_generator_op SRCS anchor_generator_op.cc
anchor_generator_op.cu)
detection_library(target_assign_op SRCS target_assign_op.cc detection_library(target_assign_op SRCS target_assign_op.cc
target_assign_op.cu) target_assign_op.cu)
detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/anchor_generator_op.h"
namespace paddle {
namespace operators {
class AnchorGeneratorOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of AnchorGeneratorOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Anchors"),
"Output(Anchors) of AnchorGeneratorOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Variances"),
"Output(Variances) of AnchorGeneratorOp should not be null.");
auto input_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
auto anchor_sizes = ctx->Attrs().Get<std::vector<float>>("anchor_sizes");
auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
auto stride = ctx->Attrs().Get<std::vector<float>>("stride");
auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
size_t num_anchors = aspect_ratios.size() * anchor_sizes.size();
std::vector<int64_t> dim_vec(4);
dim_vec[0] = input_dims[2];
dim_vec[1] = input_dims[3];
dim_vec[2] = num_anchors;
dim_vec[3] = 4;
ctx->SetOutputDim("Anchors", framework::make_ddim(dim_vec));
ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
ctx.device_context());
}
};
class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input",
"(Tensor, default Tensor<float>), "
"the input feature is a tensor with a rank of 4. "
"The layout is NCHW.");
AddOutput("Anchors",
"(Tensor, default Tensor<float>), the output is a "
"tensor with a rank of 4. The layout is [H, W, num_anchors, 4]. "
"H is the height of input, W is the width of input, num_anchors "
"is the box count of each position. "
"Each anchor is in (xmin, ymin, xmax, ymax) format");
AddOutput("Variances",
"(Tensor, default Tensor<float>), the expanded variances for "
"normalizing bbox regression targets. The layout is [H, W, "
"num_anchors, 4]. "
"H is the height of input, W is the width of input, num_anchors "
"is the box count of each position. "
"Each variance is in (xcenter, ycenter, w, h) format");
AddAttr<std::vector<float>>(
"anchor_sizes",
"(vector<float>) List of Region Proposal Network(RPN) anchor sizes "
" given in absolute pixels e.g. (64, 128, 256, 512)."
" For instance, the anchor size of 64 means the area of this anchor "
"equals to 64**2.")
.AddCustomChecker([](const std::vector<float>& anchor_sizes) {
PADDLE_ENFORCE_GT(anchor_sizes.size(), 0,
"Size of anchor_sizes must be at least 1.");
for (size_t i = 0; i < anchor_sizes.size(); ++i) {
PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
"anchor_sizes[%d] must be positive.", i);
}
});
AddAttr<std::vector<float>>(
"aspect_ratios",
"(vector<float>) List of Region Proposal Network(RPN) anchor aspect "
"ratios, e.g. (0.5, 1, 2)."
"For instacne, the aspect ratio of 0.5 means the height / width of "
"this anchor equals 0.5.");
AddAttr<std::vector<float>>("variances",
"(vector<float>) List of variances to be used "
"in box regression deltas")
.AddCustomChecker([](const std::vector<float>& variances) {
PADDLE_ENFORCE_EQ(variances.size(), 4,
"Must and only provide 4 variance.");
for (size_t i = 0; i < variances.size(); ++i) {
PADDLE_ENFORCE_GT(variances[i], 0.0,
"variance[%d] must be greater than 0.", i);
}
});
AddAttr<std::vector<float>>("stride",
"Anchors stride across width and height, "
"with a default of (16, 16)")
.SetDefault(std::vector<float>(2, 16.0))
.AddCustomChecker([](const std::vector<float>& stride) {
PADDLE_ENFORCE_EQ(
stride.size(), 2,
"Must and only provide 2 stride for width and height.");
for (size_t i = 0; i < stride.size(); ++i) {
PADDLE_ENFORCE_GT(stride[i], 0.0,
"stride[%d] should be larger than 0.", i);
}
});
AddAttr<float>("offset",
"(float) "
"Anchor center offset, with a default of 0.5")
.SetDefault(0.5);
AddComment(R"DOC(
AnchorGenerator operator
Generates anchors for Faster RCNN, FPN etc. algorithm.
Each position of the input produce N anchors, N =
size(anchor_sizes) * size(aspect_ratios).
Please get more information from the following papers:
https://arxiv.org/abs/1506.01497.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(anchor_generator, ops::AnchorGeneratorOp,
ops::AnchorGeneratorOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(anchor_generator, ops::AnchorGeneratorOpKernel<float>,
ops::AnchorGeneratorOpKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/anchor_generator_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
const T* anchor_sizes, const int as_num,
const T* stride, const int sd_num, const int height,
const int width, const T offset) {
int num_anchors = as_num * ar_num;
int box_num = height * width * num_anchors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
int h_idx = i / (num_anchors * width);
int w_idx = (i / num_anchors) % width;
T stride_width = stride[0];
T stride_height = stride[1];
T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
T area, area_ratios;
T base_w, base_h;
T scale_w, scale_h;
T anchor_width, anchor_height;
int anch_idx = i % num_anchors;
int ar_idx = anch_idx / as_num;
int as_idx = anch_idx % as_num;
T aspect_ratio = aspect_ratios[ar_idx];
T anchor_size = anchor_sizes[as_idx];
area = stride_width * stride_height;
area_ratios = area / aspect_ratio;
base_w = round(sqrt(area_ratios));
base_h = round(base_w * aspect_ratio);
scale_w = anchor_size / stride_width;
scale_h = anchor_size / stride_height;
anchor_width = scale_w * base_w;
anchor_height = scale_h * base_h;
T xmin = (x_ctr - 0.5 * (anchor_width - 1));
T ymin = (y_ctr - 0.5 * (anchor_height - 1));
T xmax = (x_ctr + 0.5 * (anchor_width - 1));
T ymax = (y_ctr + 0.5 * (anchor_height - 1));
out[i * 4] = xmin;
out[i * 4 + 1] = ymin;
out[i * 4 + 2] = xmax;
out[i * 4 + 3] = ymax;
}
}
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
}
template <typename T>
class AnchorGeneratorOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<paddle::framework::Tensor>("Input");
auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
auto stride = ctx.Attr<std::vector<float>>("stride");
auto variances = ctx.Attr<std::vector<float>>("variances");
T offset = static_cast<T>(ctx.Attr<float>("offset"));
auto width = input->dims()[3];
auto height = input->dims()[2];
int num_anchors = aspect_ratios.size() * anchor_sizes.size();
int box_num = width * height * num_anchors;
int block = 512;
int grid = (box_num + block - 1) / block;
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
anchors->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
framework::Tensor ar;
framework::TensorFromVector(aspect_ratios, ctx.device_context(), &ar);
framework::Tensor as;
framework::TensorFromVector(anchor_sizes, ctx.device_context(), &as);
framework::Tensor sd;
framework::TensorFromVector(stride, ctx.device_context(), &sd);
GenAnchors<T><<<grid, block, 0, stream>>>(
anchors->data<T>(), ar.data<T>(), aspect_ratios.size(), as.data<T>(),
anchor_sizes.size(), sd.data<T>(), stride.size(), height, width,
offset);
framework::Tensor v;
framework::TensorFromVector(variances, ctx.device_context(), &v);
grid = (box_num * 4 + block - 1) / block;
SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
variances.size(), box_num * 4);
}
}; // namespace operators
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(anchor_generator,
ops::AnchorGeneratorOpCUDAKernel<float>,
ops::AnchorGeneratorOpCUDAKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
template <typename T>
class AnchorGeneratorOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<paddle::framework::Tensor>("Input");
auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
auto stride = ctx.Attr<std::vector<float>>("stride");
auto variances = ctx.Attr<std::vector<float>>("variances");
T offset = static_cast<T>(ctx.Attr<float>("offset"));
auto feature_width = input->dims()[3];
auto feature_height = input->dims()[2];
T stride_width, stride_height;
stride_width = stride[0];
stride_height = stride[1];
int num_anchors = aspect_ratios.size() * anchor_sizes.size();
anchors->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
auto e_anchors = framework::EigenTensor<T, 4>::From(*anchors);
for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
T area, area_ratios;
T base_w, base_h;
T scale_w, scale_h;
T anchor_width, anchor_height;
int idx = 0;
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
auto ar = aspect_ratios[r];
for (size_t s = 0; s < anchor_sizes.size(); ++s) {
auto anchor_size = anchor_sizes[s];
area = stride_width * stride_height;
area_ratios = area / ar;
base_w = round(sqrt(area_ratios));
base_h = round(base_w * ar);
scale_w = anchor_size / stride_width;
scale_h = anchor_size / stride_height;
anchor_width = scale_w * base_w;
anchor_height = scale_h * base_h;
e_anchors(h_idx, w_idx, idx, 0) =
(x_ctr - 0.5 * (anchor_width - 1));
e_anchors(h_idx, w_idx, idx, 1) =
(y_ctr - 0.5 * (anchor_height - 1));
e_anchors(h_idx, w_idx, idx, 2) =
(x_ctr + 0.5 * (anchor_width - 1));
e_anchors(h_idx, w_idx, idx, 3) =
(y_ctr + 0.5 * (anchor_height - 1));
idx++;
}
}
}
}
framework::Tensor var_t;
var_t.mutable_data<T>(
framework::make_ddim({1, static_cast<int>(variances.size())}),
ctx.GetPlace());
auto var_et = framework::EigenTensor<T, 2>::From(var_t);
for (size_t i = 0; i < variances.size(); ++i) {
var_et(0, i) = variances[i];
}
int anchor_num = feature_height * feature_width * num_anchors;
auto var_dim = vars->dims();
vars->Resize({anchor_num, static_cast<int>(variances.size())});
auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(anchor_num, 1));
vars->Resize(var_dim);
}
}; // namespace operators
} // namespace operators
} // namespace paddle
...@@ -115,6 +115,7 @@ class MKLDNNMemory { ...@@ -115,6 +115,7 @@ class MKLDNNMemory {
template <typename T> template <typename T>
class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override { void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace."); "It must use CPUPlace.");
......
...@@ -26,12 +26,8 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { ...@@ -26,12 +26,8 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
"Input(X) of FillZerosLikeOp should not be null."); "Input(X) of FillZerosLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillZerosLikeOp should not be null."); "Output(Out) of FillZerosLikeOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
if (ctx->IsRuntime() && ctx->ShareLoD("X", /*->*/ "Out");
ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR_ARRAY) {
return; // skip runtime infershape when is tensor array;
}
} }
}; };
...@@ -43,7 +39,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -43,7 +39,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
FillZerosLike Operator. FillZerosLike Operator.
Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray. Fill up a variable with zeros.
The output will have the same size as the input. The output will have the same size as the input.
)DOC"); )DOC");
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -24,29 +23,12 @@ template <typename DeviceContext, typename T> ...@@ -24,29 +23,12 @@ template <typename DeviceContext, typename T>
class FillZerosLikeKernel : public framework::OpKernel<T> { class FillZerosLikeKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto var = context.InputVar("X"); auto* out = context.Output<framework::Tensor>("Out");
if (var->IsType<framework::LoDTensor>()) { out->mutable_data<T>(context.GetPlace());
auto& input = *context.Input<framework::LoDTensor>("X");
auto& output = *context.Output<framework::LoDTensor>("Out"); math::SetConstant<DeviceContext, T> setter;
output.Resize(input.dims()); setter(context.template device_context<DeviceContext>(), out,
output.set_lod(input.lod()); static_cast<T>(0));
output.mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output),
static_cast<T>(0));
} else if (var->IsType<framework::LoDTensorArray>()) {
auto& input = *context.Input<framework::LoDTensorArray>("X");
auto& output = *context.Output<framework::LoDTensorArray>("Out");
output.resize(input.size());
for (auto i = 0; i < input.size(); i++) {
output[i].Resize(input[i].dims());
output[i].set_lod(input[i].lod());
output[i].mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output[i]),
static_cast<T>(0));
}
}
} }
}; };
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include <immintrin.h> #include <immintrin.h>
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
// TODO(qingqing) refine this dependence // TODO(qingqing) refine this dependence
#include "paddle/cuda/src/avx_mathfun.h" #include "paddle/legacy/cuda/src/avx_mathfun.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <ctime> #include <ctime>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
namespace paddle { namespace paddle {
...@@ -62,7 +63,7 @@ struct Formater { ...@@ -62,7 +63,7 @@ struct Formater {
} }
} }
void PrintDtype() { void PrintDtype() {
if (dtype.hash_code() != typeid(const char).hash_code()) { if (!framework::IsType<const char>(dtype)) {
CLOG << "\tdtype: " << dtype.name() << std::endl; CLOG << "\tdtype: " << dtype.name() << std::endl;
} }
} }
...@@ -83,15 +84,15 @@ struct Formater { ...@@ -83,15 +84,15 @@ struct Formater {
void PrintData(size_t size) { void PrintData(size_t size) {
PADDLE_ENFORCE_NOT_NULL(data); PADDLE_ENFORCE_NOT_NULL(data);
// print float // print float
if (dtype.hash_code() == typeid(const float).hash_code()) { if (framework::IsType<const float>(dtype)) {
Display<float>(size); Display<float>(size);
} else if (dtype.hash_code() == typeid(const double).hash_code()) { } else if (framework::IsType<const double>(dtype)) {
Display<double>(size); Display<double>(size);
} else if (dtype.hash_code() == typeid(const int).hash_code()) { } else if (framework::IsType<const int>(dtype)) {
Display<int>(size); Display<int>(size);
} else if (dtype.hash_code() == typeid(const int64_t).hash_code()) { } else if (framework::IsType<const int64_t>(dtype)) {
Display<int64_t>(size); Display<int64_t>(size);
} else if (dtype.hash_code() == typeid(const bool).hash_code()) { } else if (framework::IsType<const bool>(dtype)) {
Display<bool>(size); Display<bool>(size);
} else { } else {
CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl; CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl;
......
...@@ -66,9 +66,19 @@ class ReadOp : public framework::OperatorBase { ...@@ -66,9 +66,19 @@ class ReadOp : public framework::OperatorBase {
std::vector<std::string> out_arg_names = Outputs("Out"); std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
PADDLE_ENFORCE(!ins.empty(), "There is no next data."); if (ins.empty()) {
if (Attr<bool>("throw_eof_exp")) {
PADDLE_THROW("There is no next data.");
} else {
ins.resize(out_arg_names.size());
for (auto& tensor : ins) {
// data type is not important for subsequent DataBalanceOpHandle
tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
}
}
}
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) { for (size_t i = 0; i < out_arg_names.size(); ++i) {
auto* out = auto* out =
scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>(); scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
out->ShareDataWith(ins[i]); out->ShareDataWith(ins[i]);
...@@ -82,6 +92,10 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,6 +92,10 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddInput("Reader", "(ReaderHolder) The executed reader."); AddInput("Reader", "(ReaderHolder) The executed reader.");
AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable(); AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
AddAttr<bool>("throw_eof_exp",
"If set true, an exception will be thrown when the Reader "
"yields empty (which means there is no next data).")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Read Operator Read Operator
......
...@@ -12,14 +12,108 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,108 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/reshape_op.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class ReshapeOp : public framework::OperatorWithKernel {
public:
ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ReshapeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ReshapeOp should not be null.");
const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
PADDLE_ENFORCE(!shape.empty(),
"The shape information must be set by Attr(shape).");
if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
// If true, set the shape of Output(Out) according to Input(Shape) in
// ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
ctx->ShareLoD("X", /*->*/ "Out");
return;
}
auto x_dims = ctx->GetInputDim("X");
auto out_dims = ValidateShape(shape, x_dims);
ctx->SetOutputDim("Out", out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
static framework::DDim ValidateShape(const std::vector<int> shape,
const framework::DDim &in_dims) {
const int64_t in_size = framework::product(in_dims);
// only one dimension can be set to -1, whose size will be automatically
// infered.
const int64_t unk_dim_val = -1;
const int64_t copy_dim_val = 0;
std::vector<int64_t> output_shape(shape.size(), 0);
int64_t capacity = 1;
int unk_dim_idx = -1;
for (size_t i = 0; i < shape.size(); ++i) {
if (shape[i] == unk_dim_val) {
PADDLE_ENFORCE(
unk_dim_idx == -1,
"Only one input dimension of Attr(shape) can be unknown.");
unk_dim_idx = i;
} else if (shape[i] == copy_dim_val) {
PADDLE_ENFORCE(
static_cast<int>(i) < in_dims.size(),
"The index of dimension to copy from input shape must be less "
"than the size of input shape.");
} else {
PADDLE_ENFORCE(
shape[i] > 0,
"Each input dimension of Attr(shape) must not be negtive except "
"one unknown dimension.");
}
capacity *= (shape[i] ? shape[i] : in_dims[i]);
output_shape[i] =
(shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
}
if (unk_dim_idx != -1) {
if (in_size > 0) {
// in_size < 0 and is un-determinate in compile time, skip the check,
// for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
// capacity = -24, in_size = -8, output_shape[0] = 0
// the following check will fail.
output_shape[unk_dim_idx] = -in_size / capacity;
PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
"Invalid shape is given.");
} else {
output_shape[unk_dim_idx] = -1;
}
} else {
PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
}
return framework::make_ddim(output_shape);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -107,19 +201,93 @@ class ReshapeGradOp : public framework::OperatorWithKernel { ...@@ -107,19 +201,93 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
} }
}; };
class ReshapeKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *out = ctx.Output<framework::LoDTensor>("Out");
auto *in = ctx.Input<framework::LoDTensor>("X");
auto *shape_tensor = ctx.HasInput("Shape")
? ctx.Input<framework::LoDTensor>("Shape")
: nullptr;
framework::DDim out_dims = out->dims();
if (shape_tensor) {
auto *shape_data = shape_tensor->data<int>();
framework::Tensor cpu_shape_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
auto shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel());
out_dims = ReshapeOp::ValidateShape(shape, in->dims());
}
if (!in->lod().empty()) {
PADDLE_ENFORCE_EQ(
out_dims[0], in->dims()[0],
"Reshape operator cannot reshape an input sequence batch "
"into an output sequence batch that has a different "
"number of time steps. Please consider using "
"sequence_reshape op.");
}
bool inplace = ctx.Attr<bool>("inplace");
out->Resize(out_dims);
if (!inplace) {
out->mutable_data(ctx.GetPlace(), in->type());
framework::TensorCopySync(*in, ctx.GetPlace(), out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
out->Resize(out_dims);
}
}
};
class ReshapeGradKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_x->mutable_data(ctx.GetPlace(), d_out->type());
bool inplace = ctx.Attr<bool>("inplace");
auto in_dims = d_x->dims();
if (!inplace) {
framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
ctx.device_context().Wait();
d_x->Resize(in_dims);
} else {
d_x->ShareDataWith(*d_out);
d_x->Resize(in_dims);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>, REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
ops::ReshapeKernel<CPU, double>, ops::ReshapeKernel, int, ops::ReshapeKernel,
ops::ReshapeKernel<CPU, int>, int64_t, ops::ReshapeKernel);
ops::ReshapeKernel<CPU, int64_t>); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel<CPU, float>, double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel<CPU, double>, ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel<CPU, int>, ops::ReshapeGradKernel);
ops::ReshapeGradKernel<CPU, int64_t>);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reshape_op.h"
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel<CUDA, float>,
paddle::operators::ReshapeKernel<CUDA, double>,
paddle::operators::ReshapeKernel<CUDA, int>,
paddle::operators::ReshapeKernel<CUDA, int64_t>);
REGISTER_OP_CUDA_KERNEL(reshape_grad,
paddle::operators::ReshapeGradKernel<CUDA, float>,
paddle::operators::ReshapeGradKernel<CUDA, double>,
paddle::operators::ReshapeGradKernel<CUDA, int>,
paddle::operators::ReshapeGradKernel<CUDA, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class ReshapeOp : public framework::OperatorWithKernel {
public:
ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ReshapeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ReshapeOp should not be null.");
const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
PADDLE_ENFORCE(!shape.empty(),
"The shape information must be set by Attr(shape).");
if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
// If true, set the shape of Output(Out) according to Input(Shape) in
// ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
ctx->ShareLoD("X", /*->*/ "Out");
return;
}
auto x_dims = ctx->GetInputDim("X");
auto out_dims = ValidateShape(shape, x_dims);
ctx->SetOutputDim("Out", out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
static framework::DDim ValidateShape(const std::vector<int> shape,
const framework::DDim &in_dims) {
const int64_t in_size = framework::product(in_dims);
// only one dimension can be set to -1, whose size will be automatically
// infered.
const int64_t unk_dim_val = -1;
const int64_t copy_dim_val = 0;
std::vector<int64_t> output_shape(shape.size(), 0);
int64_t capacity = 1;
int unk_dim_idx = -1;
for (size_t i = 0; i < shape.size(); ++i) {
if (shape[i] == unk_dim_val) {
PADDLE_ENFORCE(
unk_dim_idx == -1,
"Only one input dimension of Attr(shape) can be unknown.");
unk_dim_idx = i;
} else if (shape[i] == copy_dim_val) {
PADDLE_ENFORCE(
static_cast<int>(i) < in_dims.size(),
"The index of dimension to copy from input shape must be less "
"than the size of input shape.");
} else {
PADDLE_ENFORCE(
shape[i] > 0,
"Each input dimension of Attr(shape) must not be negtive except "
"one unknown dimension.");
}
capacity *= (shape[i] ? shape[i] : in_dims[i]);
output_shape[i] =
(shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
}
if (unk_dim_idx != -1) {
if (in_size > 0) {
// in_size < 0 and is un-determinate in compile time, skip the check,
// for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
// capacity = -24, in_size = -8, output_shape[0] = 0
// the following check will fail.
output_shape[unk_dim_idx] = -in_size / capacity;
PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
"Invalid shape is given.");
} else {
output_shape[unk_dim_idx] = -1;
}
} else {
PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
}
return framework::make_ddim(output_shape);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
template <typename DeviceContext, typename T>
class ReshapeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const {
auto *out = ctx.Output<framework::LoDTensor>("Out");
auto *in = ctx.Input<framework::LoDTensor>("X");
auto *shape_tensor = ctx.HasInput("Shape")
? ctx.Input<framework::LoDTensor>("Shape")
: nullptr;
framework::DDim out_dims = out->dims();
if (shape_tensor) {
auto *shape_data = shape_tensor->data<int>();
framework::Tensor cpu_shape_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
auto shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel());
out_dims = ReshapeOp::ValidateShape(shape, in->dims());
}
if (!in->lod().empty()) {
PADDLE_ENFORCE_EQ(
out_dims[0], in->dims()[0],
"Reshape operator cannot reshape an input sequence batch "
"into an output sequence batch that has a different "
"number of time steps. Please consider using "
"sequence_reshape op.");
}
bool inplace = ctx.Attr<bool>("inplace");
out->Resize(out_dims);
if (!inplace) {
out->mutable_data<T>(ctx.GetPlace());
framework::TensorCopySync(*in, ctx.GetPlace(), out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
out->Resize(out_dims);
}
}
};
template <typename DeviceContext, typename T>
class ReshapeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const {
auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_x->mutable_data<T>(ctx.GetPlace());
bool inplace = ctx.Attr<bool>("inplace");
auto in_dims = d_x->dims();
if (!inplace) {
framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
ctx.device_context().Wait();
d_x->Resize(in_dims);
} else {
d_x->ShareDataWith(*d_out);
d_x->Resize(in_dims);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
...@@ -135,15 +136,14 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -135,15 +136,14 @@ class WhileGradOp : public framework::OperatorBase {
auto &og_inside = auto &og_inside =
detail::Ref(cur_scope.Var(inside_og_name), detail::Ref(cur_scope.Var(inside_og_name),
"Cannot find inside gradient %s", inside_og_name); "Cannot find inside gradient %s", inside_og_name);
if (og_outside.Type().hash_code() == if (framework::IsType<framework::LoDTensor>(og_outside.Type())) {
typeid(framework::LoDTensor).hash_code()) {
auto &outside_tensor = og_outside.Get<framework::LoDTensor>(); auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
auto &inside_tensor = auto &inside_tensor =
detail::Ref(og_inside.GetMutable<framework::LoDTensor>()); detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.set_lod(outside_tensor.lod());
inside_tensor.ShareDataWith(outside_tensor); inside_tensor.ShareDataWith(outside_tensor);
} else if (og_outside.Type().hash_code() == } else if (framework::IsType<framework::LoDTensorArray>(
typeid(framework::LoDTensorArray).hash_code()) { og_outside.Type())) {
auto &outside_array = og_outside.Get<framework::LoDTensorArray>(); auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
auto &inside_array = auto &inside_array =
detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>()); detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
......
...@@ -113,7 +113,11 @@ template <typename... Args> ...@@ -113,7 +113,11 @@ template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
bool stat, const Args&... args) { bool stat, const Args&... args) {
if (UNLIKELY(!(stat))) { if (UNLIKELY(!(stat))) {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(string::Sprintf(args...)); throw std::runtime_error(string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
} }
} }
...@@ -123,8 +127,12 @@ template <typename... Args> ...@@ -123,8 +127,12 @@ template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudaError_t e, const Args&... args) { cudaError_t e, const Args&... args) {
if (UNLIKELY(e)) { if (UNLIKELY(e)) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(e, thrust::cuda_category(), throw thrust::system_error(e, thrust::cuda_category(),
string::Sprintf(args...)); string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
} }
} }
...@@ -132,8 +140,12 @@ template <typename... Args> ...@@ -132,8 +140,12 @@ template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
curandStatus_t stat, const Args&... args) { curandStatus_t stat, const Args&... args) {
if (stat != CURAND_STATUS_SUCCESS) { if (stat != CURAND_STATUS_SUCCESS) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
string::Sprintf(args...)); string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
} }
} }
...@@ -143,8 +155,12 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -143,8 +155,12 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
if (stat == CUDNN_STATUS_SUCCESS) { if (stat == CUDNN_STATUS_SUCCESS) {
return; return;
} else { } else {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
string::Sprintf(args...)); string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
} }
} }
...@@ -173,7 +189,11 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -173,7 +189,11 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
} else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
err = "CUBLAS: license error, "; err = "CUBLAS: license error, ";
} }
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(err + string::Sprintf(args...)); throw std::runtime_error(err + string::Sprintf(args...));
#else
LOG(FATAL) << err << string::Sprintf(args...);
#endif
} }
#ifndef __APPLE__ #ifndef __APPLE__
...@@ -183,8 +203,13 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -183,8 +203,13 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
if (stat == ncclSuccess) { if (stat == ncclSuccess) {
return; return;
} else { } else {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
string::Sprintf(args...)); string::Sprintf(args...));
#else
LOG(FATAL) << platform::dynload::ncclGetErrorString(stat)
<< string::Sprintf(args...);
#endif
} }
} }
#endif // __APPLE__ #endif // __APPLE__
...@@ -203,6 +228,7 @@ inline void throw_on_error(T e) { ...@@ -203,6 +228,7 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \ __FILE__, __LINE__); \
} while (false) } while (false)
#ifndef REPLACE_ENFORCE_GLOG
#define PADDLE_ENFORCE(...) \ #define PADDLE_ENFORCE(...) \
do { \ do { \
try { \ try { \
...@@ -212,6 +238,9 @@ inline void throw_on_error(T e) { ...@@ -212,6 +238,9 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \ __FILE__, __LINE__); \
} \ } \
} while (false) } while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif
/* /*
* Some enforce helpers here, usage: * Some enforce helpers here, usage:
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <mkldnn.h> #include <mkldnn.h>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -182,10 +183,11 @@ class MKLDNNHandler { ...@@ -182,10 +183,11 @@ class MKLDNNHandler {
} }
std::shared_ptr<mkldnn::memory> AcquireMemory( std::shared_ptr<mkldnn::memory> AcquireMemory(
mkldnn::memory::primitive_desc& mpd, mkldnn::memory::primitive_desc& mpd, // NOLINT
mkldnn::memory::primitive_desc& user_mpd, mkldnn::memory::primitive_desc& user_mpd, // NOLINT
const std::shared_ptr<mkldnn::memory> user_memory_p, const std::shared_ptr<mkldnn::memory> user_memory_p,
const std::string& suffix, std::vector<mkldnn::primitive>& pipeline) { const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
// create reorder primitive if the input format is not the preferred one // create reorder primitive if the input format is not the preferred one
auto local_key = key_ + suffix; auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p"; auto key_reorder_p = key_ + suffix + "reorder_p";
...@@ -218,7 +220,7 @@ class MKLDNNHandler { ...@@ -218,7 +220,7 @@ class MKLDNNHandler {
return target_memory_p; return target_memory_p;
} }
static std::string GetHash(mkldnn::memory::dims& operand_dims, static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT
const std::string& suffix) { const std::string& suffix) {
auto dims2str = [](const mkldnn::memory::dims& operand_dims) { auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
std::string dstr = ""; std::string dstr = "";
...@@ -227,8 +229,9 @@ class MKLDNNHandler { ...@@ -227,8 +229,9 @@ class MKLDNNHandler {
} }
return dstr; return dstr;
}; };
return dims2str(operand_dims) + suffix; return dims2str(operand_dims) + suffix;
}; }
protected: protected:
const MKLDNNDeviceContext& dev_ctx_; const MKLDNNDeviceContext& dev_ctx_;
...@@ -237,5 +240,15 @@ class MKLDNNHandler { ...@@ -237,5 +240,15 @@ class MKLDNNHandler {
bool is_reusing_; bool is_reusing_;
}; };
inline mkldnn::memory::format MKLDNNFormatForSize(
size_t dims_size, mkldnn::memory::format data_format) {
if (dims_size == 1) {
return mkldnn::memory::format::x;
} else if (dims_size == 2) {
return mkldnn::memory::format::nc;
}
return data_format;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -643,7 +643,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -643,7 +643,11 @@ All parameter, weight, gradient are variables in Paddle.
[](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) { [](BuildStrategy &self, const std::string &path) {
self.debug_graphviz_path_ = path; self.debug_graphviz_path_ = path;
}); })
.def_property(
"enable_data_balance",
[](const BuildStrategy &self) { return self.enable_data_balance_; },
[](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; });
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::unordered_set<std::string> &,
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册