提交 1800fef1 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into port_pybind11

...@@ -97,6 +97,14 @@ if(WITH_GPU) ...@@ -97,6 +97,14 @@ if(WITH_GPU)
endif() endif()
include_directories(${TENSORRT_INCLUDE_DIR}) include_directories(${TENSORRT_INCLUDE_DIR})
endif() endif()
if(WITH_ANAKIN)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
endif()
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
endif()
endif()
elseif(WITH_AMD_GPU) elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP) add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
......
...@@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN) ...@@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN)
return() return()
endif() endif()
set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH INCLUDE(ExternalProject)
"Anakin install path." FORCE) set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin)
set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") # the anakin install dir is only default one now
set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output)
set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR})
set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}")
include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
set(ANAKIN_COMPILE_EXTRA_FLAGS set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
...@@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS ...@@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder -Wno-reorder
-Wno-error=cpp) -Wno-error=cpp)
set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz") ExternalProject_Add(
extern_anakin
# A helper function used in Anakin, currently, to use it, one need to recursively include ${EXTERNAL_PROJECT_LOG_ARGS}
# nearly all the header files. # TODO(luotao): use PaddlePaddle/Anakin later
function(fetch_include_recursively root_dir) GIT_REPOSITORY "https://github.com/luotao1/Anakin"
if (IS_DIRECTORY ${root_dir}) GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be"
include_directories(${root_dir}) PREFIX ${ANAKIN_SOURCE_DIR}
endif() UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) -DUSE_X86_PLACE=YES
foreach(sub ${ALL_SUB}) -DBUILD_WITH_UNIT_TEST=NO
if (IS_DIRECTORY ${root_dir}/${sub}) -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
fetch_include_recursively(${root_dir}/${sub}) -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
endif() -DCUDNN_ROOT=${CUDNN_ROOT}
endforeach() ${EXTERNAL_OPTIONAL_ARGS}
endfunction() CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
endif()
if (WITH_ANAKIN) message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
fetch_include_recursively(${ANAKIN_INCLUDE}) add_library(anakin_shared SHARED IMPORTED GLOBAL)
link_directories(${ANAKIN_LIBRARY}) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
endif() add_dependencies(anakin_shared extern_anakin protobuf mklml)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin protobuf mklml)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
...@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU) ...@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release ${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin) DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib) list(APPEND inference_deps anakin_inference_lib)
endif() endif()
......
...@@ -38,11 +38,3 @@ _switch_scope ...@@ -38,11 +38,3 @@ _switch_scope
.. autofunction:: paddle.fluid.executor._switch_scope .. autofunction:: paddle.fluid.executor._switch_scope
:noindex: :noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.executor.fetch_var
:noindex:
...@@ -106,22 +106,6 @@ _switch_scope ...@@ -106,22 +106,6 @@ _switch_scope
.. autofunction:: paddle.fluid._switch_scope .. autofunction:: paddle.fluid._switch_scope
:noindex: :noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel: .. _api_fluid_make_channel:
......
...@@ -34,21 +34,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, ...@@ -34,21 +34,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None,
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
...@@ -62,20 +51,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', ' ...@@ -62,20 +51,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', '
paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0)) paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
...@@ -338,14 +323,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', ...@@ -338,14 +323,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager {
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [&](const Node* node) { auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu"}); {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
if (!node->IsFunction()) return false; if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node); const auto* func = static_cast<const Function*>(node);
if (teller_set.count(func->func_type())) if (teller_set.count(func->func_type())) {
return true; return true;
else { } else {
return false; return false;
} }
}; };
......
...@@ -45,7 +45,6 @@ endfunction(inference_api_test) ...@@ -45,7 +45,6 @@ endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
...@@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e ...@@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e
endif() endif()
if (WITH_ANAKIN) # only needed in CI if (WITH_ANAKIN) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
fetch_include_recursively(${ANAKIN_INCLUDE})
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) function(anakin_target target_name)
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common) endfunction()
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) anakin_target(inference_anakin_api)
#anakin_target(inference_anakin_api_shared)
if (WITH_TESTING) if (WITH_TESTING)
# this test is unstable, disable it first. cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api dynload_cuda SERIAL)
#DEPS inference_anakin_api_shared) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif(WITH_TESTING)
endif(WITH_TESTING)
endif() endif()
...@@ -45,7 +45,7 @@ class PaddleBuf { ...@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
......
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc activation_op.cc softmax_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry) DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc ...@@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights.
*/
class SoftMaxOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4)
<< "convert a fluid softmax op to tensorrt softmax layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax,
*const_cast<nvinfer1::ITensor*>(input1));
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(SoftMaxOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(8, parameters, scope, 1000);
std::vector<int> tensor_shape{8, 10};
validator.DeclInputVar("softmax-X", tensor_shape,
nvinfer1::DimsCHW(10, 1, 1));
validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("softmax");
desc.SetInput("X", {"softmax-X"});
desc.SetOutput("Out", {"softmax-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
...@@ -79,6 +79,12 @@ class TRTConvertValidation { ...@@ -79,6 +79,12 @@ class TRTConvertValidation {
} }
// Declare a Variable as input with random initialization. // Declare a Variable as input with random initialization.
void DeclInputVar(const std::string& name, const std::vector<int> tensor_dims,
const nvinfer1::Dims& trt_dims) {
DeclVar(name, tensor_dims);
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims);
}
void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) { void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims); DeclVar(name, dims);
// Declare TRT inputs. // Declare TRT inputs.
...@@ -94,12 +100,18 @@ class TRTConvertValidation { ...@@ -94,12 +100,18 @@ class TRTConvertValidation {
DeclVar(name, dims); DeclVar(name, dims);
} }
// Declare a variable in a fluid Scope. void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
platform::CPUPlace place; platform::CPUPlace place;
platform::CPUDeviceContext ctx(place); platform::CPUDeviceContext ctx(place);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
// Init Fluid tensor. // Init Fluid tensor.
std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims); std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
// There is no batchsize in ITensor's shape, but We should add it to // There is no batchsize in ITensor's shape, but We should add it to
...@@ -107,10 +119,8 @@ class TRTConvertValidation { ...@@ -107,10 +119,8 @@ class TRTConvertValidation {
// if_add_batch_ flag is true, add the max batchsize to dim_vec. // if_add_batch_ flag is true, add the max batchsize to dim_vec.
if (is_param != true && if_add_batch_ == true) if (is_param != true && if_add_batch_ == true)
dim_vec.insert(dim_vec.begin(), max_batch_size_); dim_vec.insert(dim_vec.begin(), max_batch_size_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); DeclVar(name, dim_vec);
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
} }
void SetOp(const framework::proto::OpDesc& desc) { void SetOp(const framework::proto::OpDesc& desc) {
......
...@@ -16,60 +16,6 @@ limitations under the License. */ ...@@ -16,60 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n,
int post, int size) {
int idx_x = threadIdx.x + blockIdx.x * blockDim.x;
if (idx_x < size) {
int idx_y = idx_x / post - (idx_x / (n * post)) * n;
z[idx_x] = x[idx_x] + y[idx_y];
}
}
template <typename T>
class ElementwiseAddKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using Tensor = framework::Tensor;
const auto x = ctx.Input<Tensor>("X");
const auto y = ctx.Input<Tensor>("Y");
auto z = ctx.Output<Tensor>("Out");
auto *z_data = z->mutable_data<T>(ctx.GetPlace());
auto &device = *(ctx.cuda_device_context().eigen_device());
const framework::DDim &x_dim = x->dims();
framework::DDim y_dim = y->dims();
int size = x->numel();
if (x_dim == y_dim) {
auto dim = framework::make_ddim({size});
auto z_eigen = framework::EigenTensor<T, 1>::From(*z, dim);
auto x_eigen = framework::EigenTensor<T, 1>::From(*x, dim);
auto y_eigen = framework::EigenTensor<T, 1>::From(*y, dim);
z_eigen.device(device) = x_eigen + y_eigen;
} else {
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
y_dim = trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
int threads = 512;
int grids = (size + threads - 1) / threads;
auto stream = ctx.cuda_device_context().stream();
ElementwiseAddCUDAKernel<T><<<grids, threads, 0, stream>>>(
x->data<T>(), y->data<T>(), z_data, n, post, size);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
......
...@@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> { ...@@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
// skip out, x, y
auto* out = dout;
auto *x = dout, *y = dout;
if (dx != nullptr) { if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
// In fact, we can just share memory, but it may cause a bug of memory dy != nullptr && (dx->dims() == dy->dims())) {
// optimizer elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
// dx->ShareDataWith(*dout);
framework::TensorCopy(*dout, ctx.GetPlace(),
ctx.template device_context<DeviceContext>(), dx);
}
if (dy == nullptr) return;
const framework::DDim& x_dim = dout->dims();
framework::DDim y_dim = dy->dims();
if (x_dim == y_dim) {
// dy->ShareDataWith(*dout);
framework::TensorCopy(*dout, ctx.GetPlace(),
ctx.template device_context<DeviceContext>(), dy);
} else { } else {
dy->mutable_data<T>(ctx.GetPlace()); default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
// Perform reduction to dout to calculate dy dy);
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
y_dim = trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
auto& device =
*(ctx.template device_context<DeviceContext>().eigen_device());
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
auto eigen_dout = framework::EigenTensor<T, 3>::From(
*dout, framework::make_ddim({pre, n, post}));
auto eigen_dy =
framework::EigenTensor<T, 1>::From(*dy, framework::make_ddim({n}));
eigen_dy.device(device) = eigen_dout.sum(
framework::EigenDim<2>::From(framework::make_ddim({0, 2})));
} }
} }
}; };
......
...@@ -666,7 +666,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -666,7 +666,7 @@ All parameter, weight, gradient are variables in Paddle.
const std::string &, Scope *, std::vector<Scope *> &, const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t, const ExecutionStrategy &, const BuildStrategy &, size_t,
size_t>()) size_t>())
.def("bcast_params", &ParallelExecutor::BCastParamsToDevices) .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element // We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope* // of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
...@@ -48,8 +48,6 @@ from .data_feeder import DataFeeder ...@@ -48,8 +48,6 @@ from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from .transpiler import DistributeTranspiler, InferenceTranspiler, \ from .transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig memory_optimize, release_memory, DistributeTranspilerConfig
from .concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip from . import clip
from . import profiler from . import profiler
...@@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable ...@@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ __all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + [ parallel_executor.__all__ + lod_tensor.__all__ + [
'io', 'io',
......
...@@ -19,8 +19,7 @@ from .layers import fill_constant ...@@ -19,8 +19,7 @@ from .layers import fill_constant
from . import core from . import core
__all__ = [ __all__ = [
'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select'
'Select'
] ]
...@@ -35,10 +34,10 @@ class Go(BlockGuard): ...@@ -35,10 +34,10 @@ class Go(BlockGuard):
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None: if exc_type is not None:
return False return False
self.construct_go_op() self._construct_go_op()
return super(Go, self).__exit__(exc_type, exc_val, exc_tb) return super(Go, self).__exit__(exc_type, exc_val, exc_tb)
def construct_go_op(self): def _construct_go_op(self):
main_program = self.helper.main_program main_program = self.helper.main_program
go_block = main_program.current_block() go_block = main_program.current_block()
parent_block = main_program.block(main_program.current_block() parent_block = main_program.block(main_program.current_block()
......
...@@ -18,9 +18,7 @@ import six ...@@ -18,9 +18,7 @@ import six
from .framework import Program, default_main_program, Variable from .framework import Program, default_main_program, Variable
from . import core from . import core
__all__ = [ __all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope']
'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var'
]
g_scope = core.Scope() g_scope = core.Scope()
...@@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): ...@@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
return fetch_count > 0 return fetch_count > 0
def fetch_var(name, scope=None, return_numpy=True): def _fetch_var(name, scope=None, return_numpy=True):
""" """
Fetch the value of the variable with the given name from the Fetch the value of the variable with the given name from the
given scope. given scope.
...@@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list): ...@@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list):
return str(feed_var_names + fetch_var_names) return str(feed_var_names + fetch_var_names)
def _as_lodtensor(data, place):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> place = fluid.CPUPlace()
>>> exe = fluid.executor(place)
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, place)
return tensor
class Executor(object): class Executor(object):
""" """
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
...@@ -250,35 +279,6 @@ class Executor(object): ...@@ -250,35 +279,6 @@ class Executor(object):
self.program_caches = dict() self.program_caches = dict()
self._closed = False self._closed = False
def as_lodtensor(self, data):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> exe = fluid.executor(fluid.CPUPlace())
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, self.place)
return tensor
def _get_program_cache(self, program_cache_key): def _get_program_cache(self, program_cache_key):
return self.program_caches.get(program_cache_key, None) return self.program_caches.get(program_cache_key, None)
...@@ -338,7 +338,7 @@ class Executor(object): ...@@ -338,7 +338,7 @@ class Executor(object):
feed_target_name = op.desc.output('Out')[0] feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name] cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor): if not isinstance(cur_feed, core.LoDTensor):
cur_feed = self.as_lodtensor(cur_feed) cur_feed = _as_lodtensor(cur_feed, self.place)
idx = op.desc.attr('col') idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx) core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else: else:
......
...@@ -279,19 +279,19 @@ class ParallelExecutor(object): ...@@ -279,19 +279,19 @@ class ParallelExecutor(object):
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if self.is_dist: if self.is_dist:
self.bcast_params() self._bcast_params()
if return_numpy: if return_numpy:
return executor.as_numpy(arr) return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def bcast_params(self): def _bcast_params(self):
""" """
Broadcast the parameters to other devices. It is used during Broadcast the parameters to other devices. It is used during
distributed training. distributed training.
""" """
self.executor.bcast_params(set(self.persistable_vars)) self.executor._bcast_params(set(self.persistable_vars))
@property @property
def device_count(self): def device_count(self):
......
...@@ -537,5 +537,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ...@@ -537,5 +537,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestRMSPropOptimizer(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
optimizer.minimize(avg_cost)
return
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
pserver2, startup2 = self.get_pserver(self.pserver2_ep)
self.assertEqual(len(pserver.blocks), 3)
# block1~2: optimize pass
self.assertEqual([op.type for op in pserver.blocks[1].ops],
["sum", "scale", "rmsprop"])
# the variable #fc_w will be split into two blocks
fc_w_var = startup.global_block().var("fc_w.block1")
self.assertEqual(fc_w_var.shape, (500, 1000))
moment_var = startup.global_block().var("momentum_1")
self.assertEqual(moment_var.shape, (500, 1000))
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest): ...@@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest):
layers.assign(input=val, output=x) layers.assign(input=val, output=x)
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_main_program(), feed={}, fetch_list=[]) exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
fetched_x = fluid.fetch_var("x") fetched_x = fluid.executor._fetch_var("x")
self.assertTrue( self.assertTrue(
numpy.array_equal(fetched_x, val), numpy.array_equal(fetched_x, val),
"fetch_x=%s val=%s" % (fetched_x, val)) "fetch_x=%s val=%s" % (fetched_x, val))
......
...@@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase): ...@@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase):
next_data = np.random.uniform( next_data = np.random.uniform(
low=0, high=1000, low=0, high=1000,
size=(batch_size, ) + shape[1:]).astype(dtype) size=(batch_size, ) + shape[1:]).astype(dtype)
in_data.append(executor.as_lodtensor(next_data)) in_data.append(
fluid.executor._as_lodtensor(next_data, place))
self.inputs.append(in_data) self.inputs.append(in_data)
......
...@@ -22,7 +22,7 @@ pos_enc_param_names = ( ...@@ -22,7 +22,7 @@ pos_enc_param_names = (
"src_pos_enc_table", "src_pos_enc_table",
"trg_pos_enc_table", ) "trg_pos_enc_table", )
batch_size = 64 batch_size = 2
def position_encoding_init(n_position, d_pos_vec): def position_encoding_init(n_position, d_pos_vec):
......
...@@ -751,14 +751,14 @@ class DistributeTranspiler(object): ...@@ -751,14 +751,14 @@ class DistributeTranspiler(object):
out_name = op.output("Out") out_name = op.output("Out")
ids_var = program.global_block().vars[ids_name[0]] ids_var = program.global_block().vars[ids_name[0]]
prefetch_input_vars = self.create_splited_vars( prefetch_input_vars = self._create_splited_vars(
source_var=ids_var, source_var=ids_var,
block=program.global_block(), block=program.global_block(),
tag="_prefetch_in_") tag="_prefetch_in_")
self.all_prefetch_input_vars.append(prefetch_input_vars) self.all_prefetch_input_vars.append(prefetch_input_vars)
out_var = program.global_block().vars[out_name[0]] out_var = program.global_block().vars[out_name[0]]
prefetch_output_vars = self.create_splited_vars( prefetch_output_vars = self._create_splited_vars(
source_var=out_var, source_var=out_var,
block=program.global_block(), block=program.global_block(),
tag="_prefetch_out_") tag="_prefetch_out_")
...@@ -1040,7 +1040,7 @@ class DistributeTranspiler(object): ...@@ -1040,7 +1040,7 @@ class DistributeTranspiler(object):
program.global_block()._sync_with_cpp() program.global_block()._sync_with_cpp()
return var_mapping return var_mapping
def create_splited_vars(self, source_var, block, tag): def _create_splited_vars(self, source_var, block, tag):
return [ return [
block.create_var( block.create_var(
name=str(source_var.name + tag + str(index)), name=str(source_var.name + tag + str(index)),
...@@ -1184,18 +1184,39 @@ class DistributeTranspiler(object): ...@@ -1184,18 +1184,39 @@ class DistributeTranspiler(object):
program = optimize_block.program program = optimize_block.program
pserver_block = program.global_block() pserver_block = program.global_block()
new_inputs = collections.OrderedDict() new_inputs = collections.OrderedDict()
# update param/grad shape first, then other inputs like # update param/grad shape first, then other inputs like
# moment can use the updated shape # moment can use the updated shape
def _get_param_block(opt_op):
# param is already created on global program
param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, opt_op.input("Param")[0]):
param_block = p
break
return param_block
for key in opt_op.input_names: for key in opt_op.input_names:
if key == "Grad": if key == "Grad":
new_inputs[key] = merged_var new_inputs[key] = merged_var
# For RMSProp optimizer
elif key == "Moment" or key == "MeanSquare":
param_block = _get_param_block(opt_op)
if not param_block:
return
moment_var = origin_program.global_block().vars[opt_op.input(
key)[0]]
tmpvar = pserver_block.create_var(
name=moment_var.name,
persistable=moment_var.persistable,
dtype=moment_var.dtype,
# change to use same shape as param
# TODO(typhoonzero): didn't append .block in the var name,
# may affect checkpoint saving? Need to verify.
shape=param_block.shape)
new_inputs[key] = tmpvar
elif key == "Param": elif key == "Param":
# param is already created on global program param_block = _get_param_block(opt_op)
param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, opt_op.input(key)[0]):
param_block = p
break
if not param_block: if not param_block:
return return
tmpvar = pserver_block.create_var( tmpvar = pserver_block.create_var(
...@@ -1221,7 +1242,7 @@ class DistributeTranspiler(object): ...@@ -1221,7 +1242,7 @@ class DistributeTranspiler(object):
for key in opt_op.input_names: for key in opt_op.input_names:
new_shape = None new_shape = None
if key in ["Param", "Grad", "LearningRate"]: if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]:
continue continue
var = self.origin_program.global_block().vars[opt_op.input(key)[0]] var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
# update accumulator variable shape # update accumulator variable shape
......
...@@ -57,10 +57,10 @@ class InferenceTranspiler(object): ...@@ -57,10 +57,10 @@ class InferenceTranspiler(object):
scope = global_scope() scope = global_scope()
if not isinstance(scope, core.Scope): if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None") raise TypeError("scope should be as Scope type or None")
self.fuse_batch_norm(program, place, scope) self._fuse_batch_norm(program, place, scope)
self.fuse_relu_mkldnn(program) self._fuse_relu_mkldnn(program)
def fuse_relu_mkldnn(self, program): def _fuse_relu_mkldnn(self, program):
''' '''
Transpile the program by fused relu activation for MKLDNN program. Transpile the program by fused relu activation for MKLDNN program.
...@@ -104,7 +104,7 @@ class InferenceTranspiler(object): ...@@ -104,7 +104,7 @@ class InferenceTranspiler(object):
# And a better solution will be considered later. # And a better solution will be considered later.
program = program.clone() program = program.clone()
def fuse_batch_norm(self, program, place, scope): def _fuse_batch_norm(self, program, place, scope):
''' '''
Transpile the program by fused batch normalization. Transpile the program by fused batch normalization.
......
requests==2.9.2 requests==2.9.2
numpy>=1.12 numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
protobuf==3.1 protobuf==3.1
recordio>=0.1.0 recordio>=0.1.0
matplotlib matplotlib
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册