提交 0098a494 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/op/fc

...@@ -204,12 +204,11 @@ include(external/snappy) # download snappy ...@@ -204,12 +204,11 @@ include(external/snappy) # download snappy
include(external/snappystream) include(external/snappystream)
include(external/threadpool) include(external/threadpool)
set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
if(WITH_GPU) if(WITH_GPU)
include(cuda) include(cuda)
include(tensorrt) include(tensorrt)
include(external/anakin) include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif() endif()
include(cudnn) # set cudnn libraries, must before configure include(cudnn) # set cudnn libraries, must before configure
......
...@@ -97,6 +97,14 @@ if(WITH_GPU) ...@@ -97,6 +97,14 @@ if(WITH_GPU)
endif() endif()
include_directories(${TENSORRT_INCLUDE_DIR}) include_directories(${TENSORRT_INCLUDE_DIR})
endif() endif()
if(WITH_ANAKIN)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
endif()
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
endif()
endif()
elseif(WITH_AMD_GPU) elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP) add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
......
...@@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN) ...@@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN)
return() return()
endif() endif()
set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH INCLUDE(ExternalProject)
"Anakin install path." FORCE) set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin)
set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") # the anakin install dir is only default one now
set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output)
set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR})
set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}")
include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
set(ANAKIN_COMPILE_EXTRA_FLAGS set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
...@@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS ...@@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder -Wno-reorder
-Wno-error=cpp) -Wno-error=cpp)
set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz") ExternalProject_Add(
extern_anakin
# A helper function used in Anakin, currently, to use it, one need to recursively include ${EXTERNAL_PROJECT_LOG_ARGS}
# nearly all the header files. # TODO(luotao): use PaddlePaddle/Anakin later
function(fetch_include_recursively root_dir) GIT_REPOSITORY "https://github.com/luotao1/Anakin"
if (IS_DIRECTORY ${root_dir}) GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be"
include_directories(${root_dir}) PREFIX ${ANAKIN_SOURCE_DIR}
endif() UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) -DUSE_X86_PLACE=YES
foreach(sub ${ALL_SUB}) -DBUILD_WITH_UNIT_TEST=NO
if (IS_DIRECTORY ${root_dir}/${sub}) -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
fetch_include_recursively(${root_dir}/${sub}) -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
endif() -DCUDNN_ROOT=${CUDNN_ROOT}
endforeach() ${EXTERNAL_OPTIONAL_ARGS}
endfunction() CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
endif()
if (WITH_ANAKIN) message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
fetch_include_recursively(${ANAKIN_INCLUDE}) add_library(anakin_shared SHARED IMPORTED GLOBAL)
link_directories(${ANAKIN_LIBRARY}) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
endif() add_dependencies(anakin_shared extern_anakin protobuf mklml)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin protobuf mklml)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
...@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU) ...@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release ${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin) DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib) list(APPEND inference_deps anakin_inference_lib)
endif() endif()
......
...@@ -38,11 +38,3 @@ _switch_scope ...@@ -38,11 +38,3 @@ _switch_scope
.. autofunction:: paddle.fluid.executor._switch_scope .. autofunction:: paddle.fluid.executor._switch_scope
:noindex: :noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.executor.fetch_var
:noindex:
...@@ -106,22 +106,6 @@ _switch_scope ...@@ -106,22 +106,6 @@ _switch_scope
.. autofunction:: paddle.fluid._switch_scope .. autofunction:: paddle.fluid._switch_scope
:noindex: :noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel: .. _api_fluid_make_channel:
......
...@@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N ...@@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N
paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
...@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No ...@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
...@@ -34,21 +37,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, ...@@ -34,21 +37,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None,
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
...@@ -62,20 +54,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', ' ...@@ -62,20 +54,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', '
paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0)) paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
...@@ -89,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro ...@@ -89,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro
paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
...@@ -338,14 +326,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', ...@@ -338,14 +326,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const { ...@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
} }
} }
int OpDesc::GetBlockAttr(const std::string &name) const { std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
auto blocks = boost::get<std::vector<BlockDesc *>>(it->second);
std::vector<int> ids;
for (auto n : blocks) {
ids.push_back(n->ID());
}
return ids;
}
int OpDesc::GetBlockAttrId(const std::string &name) const {
auto it = attrs_.find(name); auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return boost::get<BlockDesc *>(it->second)->ID(); return boost::get<BlockDesc *>(it->second)->ID();
......
...@@ -83,7 +83,9 @@ class OpDesc { ...@@ -83,7 +83,9 @@ class OpDesc {
Attribute GetNullableAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttr(const std::string &name) const; int GetBlockAttrId(const std::string &name) const;
std::vector<int> GetBlocksAttrIds(const std::string &name) const;
void Rename(const std::string &old_name, const std::string &new_name); void Rename(const std::string &old_name, const std::string &new_name);
......
...@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { ...@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
for (const std::string &attr_name : op->AttrNames()) { for (const std::string &attr_name : op->AttrNames()) {
if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
int sub_block_id = int sub_block_id =
o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name); o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
} }
} }
......
...@@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager {
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [&](const Node* node) { auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu"}); {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
if (!node->IsFunction()) return false; if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node); const auto* func = static_cast<const Function*>(node);
if (teller_set.count(func->func_type())) if (teller_set.count(func->func_type())) {
return true; return true;
else { } else {
return false; return false;
} }
}; };
......
...@@ -45,7 +45,6 @@ endfunction(inference_api_test) ...@@ -45,7 +45,6 @@ endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
...@@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e ...@@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e
endif() endif()
if (WITH_ANAKIN) # only needed in CI if (WITH_ANAKIN) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
fetch_include_recursively(${ANAKIN_INCLUDE})
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) function(anakin_target target_name)
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common) endfunction()
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) anakin_target(inference_anakin_api)
#anakin_target(inference_anakin_api_shared)
if (WITH_TESTING) if (WITH_TESTING)
# this test is unstable, disable it first. cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api dynload_cuda SERIAL)
#DEPS inference_anakin_api_shared) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif(WITH_TESTING)
endif(WITH_TESTING)
endif() endif()
...@@ -45,7 +45,7 @@ class PaddleBuf { ...@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
......
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc activation_op.cc softmax_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry) DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc ...@@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights.
*/
class SoftMaxOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4)
<< "convert a fluid softmax op to tensorrt softmax layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax,
*const_cast<nvinfer1::ITensor*>(input1));
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(SoftMaxOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(8, parameters, scope, 1000);
std::vector<int> tensor_shape{8, 10};
validator.DeclInputVar("softmax-X", tensor_shape,
nvinfer1::DimsCHW(10, 1, 1));
validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("softmax");
desc.SetInput("X", {"softmax-X"});
desc.SetOutput("Out", {"softmax-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
...@@ -79,6 +79,12 @@ class TRTConvertValidation { ...@@ -79,6 +79,12 @@ class TRTConvertValidation {
} }
// Declare a Variable as input with random initialization. // Declare a Variable as input with random initialization.
void DeclInputVar(const std::string& name, const std::vector<int> tensor_dims,
const nvinfer1::Dims& trt_dims) {
DeclVar(name, tensor_dims);
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims);
}
void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) { void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims); DeclVar(name, dims);
// Declare TRT inputs. // Declare TRT inputs.
...@@ -94,12 +100,18 @@ class TRTConvertValidation { ...@@ -94,12 +100,18 @@ class TRTConvertValidation {
DeclVar(name, dims); DeclVar(name, dims);
} }
// Declare a variable in a fluid Scope. void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
platform::CPUPlace place; platform::CPUPlace place;
platform::CPUDeviceContext ctx(place); platform::CPUDeviceContext ctx(place);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
// Init Fluid tensor. // Init Fluid tensor.
std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims); std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
// There is no batchsize in ITensor's shape, but We should add it to // There is no batchsize in ITensor's shape, but We should add it to
...@@ -107,10 +119,8 @@ class TRTConvertValidation { ...@@ -107,10 +119,8 @@ class TRTConvertValidation {
// if_add_batch_ flag is true, add the max batchsize to dim_vec. // if_add_batch_ flag is true, add the max batchsize to dim_vec.
if (is_param != true && if_add_batch_ == true) if (is_param != true && if_add_batch_ == true)
dim_vec.insert(dim_vec.begin(), max_batch_size_); dim_vec.insert(dim_vec.begin(), max_batch_size_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); DeclVar(name, dim_vec);
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
} }
void SetOp(const framework::proto::OpDesc& desc) { void SetOp(const framework::proto::OpDesc& desc) {
......
...@@ -235,7 +235,12 @@ else() ...@@ -235,7 +235,12 @@ else()
endif() endif()
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) if(WITH_GPU)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub)
else()
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
endif()
op_library(softmax_op DEPS softmax) op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND) if (WITH_GPU AND TENSORRT_FOUND)
...@@ -273,9 +278,9 @@ op_library(squeeze_op DEPS reshape_op) ...@@ -273,9 +278,9 @@ op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory) op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op) op_library(flatten_op DEPS reshape_op)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub)
else() else()
op_library(conv_op DEPS vol2col im2col) op_library(conv_op DEPS vol2col im2col)
endif() endif()
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -188,6 +188,7 @@ namespace ops = paddle::operators; ...@@ -188,6 +188,7 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>); REGISTER_OP_CPU_KERNEL(
crop, ops::CropKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>); crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/crop_op.h" #include "paddle/fluid/operators/crop_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>); REGISTER_OP_CUDA_KERNEL(
crop, ops::CropKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>); crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -58,32 +58,74 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) { ...@@ -58,32 +58,74 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
return res; return res;
} }
template <typename T> template <typename DeviceContext, typename T, size_t D>
void CropFunction(const framework::ExecutionContext& context) {
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
auto out_dims = out->dims();
if (out_dims[0] == -1) {
out_dims[0] = x->dims()[0];
}
out->mutable_data<T>(out_dims, context.GetPlace());
auto x_stride = framework::stride(x->dims());
auto out_stride = framework::stride(out->dims());
auto offsets = GetOffsets(context);
int64_t offset = 0;
for (size_t i = 0; i < offsets.size(); ++i) {
offset += (x_stride[i] * offsets[i]);
}
auto x_tensor = EigenTensor<T, D>::From(*x);
auto out_tensor = EigenTensor<T, D>::From(*out);
Eigen::array<int, D> e_offsets;
Eigen::array<int, D> e_shape;
for (size_t i = 0; i < D; ++i) {
e_offsets[i] = offsets[i];
e_shape[i] = out->dims()[i];
}
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape);
}
template <typename DeviceContext, typename T>
class CropKernel : public framework::OpKernel<T> { class CropKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X"); int rank = context.Input<Tensor>("X")->dims().size();
auto* out = context.Output<Tensor>("Out"); switch (rank) {
const T* x_data = x->data<T>(); case 1:
T* out_data = out->mutable_data<T>(context.GetPlace()); CropFunction<DeviceContext, T, 1>(context);
auto x_stride = framework::stride(x->dims()); break;
auto out_stride = framework::stride(out->dims()); case 2:
auto offsets = GetOffsets(context); CropFunction<DeviceContext, T, 2>(context);
int64_t offset = 0; break;
for (size_t i = 0; i < offsets.size(); ++i) { case 3:
offset += (x_stride[i] * offsets[i]); CropFunction<DeviceContext, T, 3>(context);
break;
case 4:
CropFunction<DeviceContext, T, 4>(context);
break;
case 5:
CropFunction<DeviceContext, T, 5>(context);
break;
case 6:
CropFunction<DeviceContext, T, 6>(context);
break;
default:
PADDLE_THROW(
"CropOp only support tensors with no more than 6 dimensions.");
} }
StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
out->dims(), out_stride, out_data);
} }
}; };
template <typename DeviceContext, typename T, size_t D> template <typename DeviceContext, typename T, size_t D>
void CropGradFunction(const framework::ExecutionContext& context) { void CropGradFunction(const framework::ExecutionContext& context) {
auto* d_x = context.Output<Tensor>(framework::GradVarName("X")); auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
auto* x = context.Input<Tensor>("X");
if (d_x != nullptr) { if (d_x != nullptr) {
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out")); auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
d_x->mutable_data<T>(context.GetPlace()); d_x->mutable_data<T>(x->dims(), context.GetPlace());
auto offsets = GetOffsets(context); auto offsets = GetOffsets(context);
Eigen::array<std::pair<int, int>, D> paddings; Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < D; ++i) { for (size_t i = 0; i < D; ++i) {
......
...@@ -16,60 +16,6 @@ limitations under the License. */ ...@@ -16,60 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n,
int post, int size) {
int idx_x = threadIdx.x + blockIdx.x * blockDim.x;
if (idx_x < size) {
int idx_y = idx_x / post - (idx_x / (n * post)) * n;
z[idx_x] = x[idx_x] + y[idx_y];
}
}
template <typename T>
class ElementwiseAddKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using Tensor = framework::Tensor;
const auto x = ctx.Input<Tensor>("X");
const auto y = ctx.Input<Tensor>("Y");
auto z = ctx.Output<Tensor>("Out");
auto *z_data = z->mutable_data<T>(ctx.GetPlace());
auto &device = *(ctx.cuda_device_context().eigen_device());
const framework::DDim &x_dim = x->dims();
framework::DDim y_dim = y->dims();
int size = x->numel();
if (x_dim == y_dim) {
auto dim = framework::make_ddim({size});
auto z_eigen = framework::EigenTensor<T, 1>::From(*z, dim);
auto x_eigen = framework::EigenTensor<T, 1>::From(*x, dim);
auto y_eigen = framework::EigenTensor<T, 1>::From(*y, dim);
z_eigen.device(device) = x_eigen + y_eigen;
} else {
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
y_dim = trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
int threads = 512;
int grids = (size + threads - 1) / threads;
auto stream = ctx.cuda_device_context().stream();
ElementwiseAddCUDAKernel<T><<<grids, threads, 0, stream>>>(
x->data<T>(), y->data<T>(), z_data, n, post, size);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
......
...@@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> { ...@@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
// skip out, x, y
auto* out = dout;
auto *x = dout, *y = dout;
if (dx != nullptr) { if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
// In fact, we can just share memory, but it may cause a bug of memory dy != nullptr && (dx->dims() == dy->dims())) {
// optimizer elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
// dx->ShareDataWith(*dout);
framework::TensorCopy(*dout, ctx.GetPlace(),
ctx.template device_context<DeviceContext>(), dx);
}
if (dy == nullptr) return;
const framework::DDim& x_dim = dout->dims();
framework::DDim y_dim = dy->dims();
if (x_dim == y_dim) {
// dy->ShareDataWith(*dout);
framework::TensorCopy(*dout, ctx.GetPlace(),
ctx.template device_context<DeviceContext>(), dy);
} else { } else {
dy->mutable_data<T>(ctx.GetPlace()); default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
// Perform reduction to dout to calculate dy dy);
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
y_dim = trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
auto& device =
*(ctx.template device_context<DeviceContext>().eigen_device());
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
auto eigen_dout = framework::EigenTensor<T, 3>::From(
*dout, framework::make_ddim({pre, n, post}));
auto eigen_dy =
framework::EigenTensor<T, 1>::From(*dy, framework::make_ddim({n}));
eigen_dy.device(device) = eigen_dout.sum(
framework::EigenDim<2>::From(framework::make_ddim({0, 2})));
} }
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cub/cub.cuh>
#include "paddle/fluid/operators/layer_norm_op.h" #include "paddle/fluid/operators/layer_norm_op.h"
namespace paddle {
namespace operators {
inline static int GetDesiredBlockDim(int block_dim) {
const int kMaxBlockDim = 512;
return block_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << (static_cast<int>(std::log2f(block_dim))));
}
#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \
case (1 << (log2_block_dim)): { \
constexpr auto kBlockDim = (1 << (log2_block_dim)); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM_CASE(...) \
FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__)
static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); }
static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); }
template <typename T>
struct PairForLayerNorm {
__device__ __forceinline__ PairForLayerNorm() {}
__device__ __forceinline__ PairForLayerNorm(const T &first, const T &second)
: first_(first), second_(second) {}
T first_;
T second_;
};
template <typename T>
struct PairForLayerNormAddFunctor {
__device__ __forceinline__ PairForLayerNorm<T> operator()(
const PairForLayerNorm<T> &p1, const PairForLayerNorm<T> &p2) {
return PairForLayerNorm<T>(p1.first_ + p2.first_, p1.second_ + p2.second_);
}
};
template <typename T, int BlockDim>
__global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
T *y, T *mean, T *var, float epsilon,
int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
// Step 1: Reduce to calculate mean and var
T mean_val = static_cast<T>(0);
T var_val = static_cast<T>(0);
for (int i = beg_idx; i < end_idx; i += BlockDim) {
T tmp = x[i];
mean_val += tmp;
var_val += (tmp * tmp);
}
auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(mean_val, var_val),
PairForLayerNormAddFunctor<T>());
if (threadIdx.x == 0) {
auto tmp = pair.first_ / feature_size;
mean[blockIdx.x] = tmp;
var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
}
__syncthreads();
mean_val = mean[blockIdx.x];
var_val = static_cast<T>(real_sqrt(var[blockIdx.x] + epsilon));
// Step 2: Calculate y
if (scale != nullptr) {
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j];
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = scale[j] * (x[i] - mean_val) / var_val;
}
}
} else { // scale == nullptr
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = (x[i] - mean_val) / var_val + bias[j];
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = (x[i] - mean_val) / var_val;
}
}
}
}
// Make sure that d_scale != nullptr && d_bias != nullptr
// Since d_scale != nullptr, scale would not be nullptr
template <typename T, int BlockDim, bool HasDx>
__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
T *d_scale, T *d_bias, T *d_x,
const T *mean, const T *var,
const T *scale, float epsilon,
int batch_size, int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = threadIdx.x * feature_size + blockIdx.x;
int end_idx = batch_size * feature_size + blockIdx.x;
int stride = BlockDim * feature_size;
T d_scale_partial = 0, d_bias_partial = 0;
for (int i = beg_idx; i < end_idx; i += stride) {
int row_idx = i / feature_size;
auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
d_bias_partial += d_y[i];
if (HasDx) {
d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
}
}
auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(d_scale_partial, d_bias_partial),
PairForLayerNormAddFunctor<T>());
if (threadIdx.x == 0) {
d_scale[blockIdx.x] = pair.first_;
d_bias[blockIdx.x] = pair.second_;
}
}
// Make sure that there is only one true expression: d_scale != nullptr
// or d_bias != nullptr
// Notice: scale may be nullptr
template <typename T, int BlockDim, bool HasDx, bool HasDScale>
__global__ void LayerNormBackwardGradientScaleOrBias(
const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean,
const T *var, const T *scale, float epsilon, int batch_size,
int feature_size) {
using BlockReduce = cub::BlockReduce<T, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = threadIdx.x * feature_size + blockIdx.x;
int end_idx = batch_size * feature_size + blockIdx.x;
int stride = BlockDim * feature_size;
T d_scale_or_d_bias_partial = 0;
for (int i = beg_idx; i < end_idx; i += stride) {
int row_idx = i / feature_size;
auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
if (HasDScale) {
d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
} else { // d_bias != nullptr
d_scale_or_d_bias_partial += d_y[i];
}
if (HasDx) {
if (scale != nullptr) {
d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
} else {
d_x[i] = d_y[i] / var_val;
}
}
}
d_scale_or_d_bias_partial =
BlockReduce(temp_storage).Reduce(d_scale_or_d_bias_partial, cub::Sum());
if (threadIdx.x == 0) {
if (HasDScale) {
d_scale[blockIdx.x] = d_scale_or_d_bias_partial;
} else {
d_bias[blockIdx.x] = d_scale_or_d_bias_partial;
}
}
}
template <typename T, int BlockDim>
__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x,
const T *mean,
const T *var,
float epsilon,
int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ T d_x_reduce_tmp[2];
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
T block_mean = mean[blockIdx.x];
T block_var = var[blockIdx.x];
T d_x_mean_partial = 0, d_x_var_partial = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
d_x_mean_partial += d_x[i];
d_x_var_partial += d_x[i] * (x[i] - block_mean);
}
auto pair =
BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
PairForLayerNormAddFunctor<T>());
if (threadIdx.x == 0) {
d_x_reduce_tmp[0] = pair.first_ / feature_size;
d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
}
__syncthreads();
d_x_mean_partial = d_x_reduce_tmp[0];
d_x_var_partial = d_x_reduce_tmp[1];
for (int i = beg_idx; i < end_idx; i += BlockDim) {
d_x[i] -= d_x_mean_partial;
d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
}
}
// Here, we only calculate d_x
template <typename T, int BlockDim>
__global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y,
T *d_x, const T *mean,
const T *var, const T *scale,
float epsilon,
int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ T d_x_reduce_tmp[2];
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x];
T d_x_mean_partial = 0, d_x_var_partial = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
auto var_val = static_cast<T>(real_sqrt(block_var + epsilon));
if (scale != nullptr) {
int col_idx = i % feature_size;
d_x[i] = d_y[i] * scale[col_idx] / var_val;
} else {
d_x[i] = d_y[i] / var_val;
}
d_x_mean_partial += d_x[i];
d_x_var_partial += d_x[i] * (x[i] - block_mean);
}
auto pair =
BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
PairForLayerNormAddFunctor<T>());
if (threadIdx.x == 0) {
d_x_reduce_tmp[0] = pair.first_ / feature_size;
d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
}
__syncthreads();
d_x_mean_partial = d_x_reduce_tmp[0];
d_x_var_partial = d_x_reduce_tmp[1];
for (int i = beg_idx; i < end_idx; i += BlockDim) {
d_x[i] -= d_x_mean_partial;
d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
}
}
template <typename T>
__global__ void LayerNormBackwardWhenBatchSizeIsOne(
const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean,
const T *var, const T *scale, float epsilon, int feature_size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < feature_size) {
auto var_val = static_cast<T>(real_sqrt(var[idx] + epsilon));
if (d_x != nullptr) {
if (d_scale == nullptr) {
d_x[idx] = d_y[idx] / var_val;
} else {
d_x[idx] = d_y[idx] * scale[idx] / var_val;
}
}
if (d_scale != nullptr) {
d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val;
}
if (d_bias != nullptr) d_bias[idx] = d_y[idx];
}
}
template <typename T>
static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
const T *mean, const T *var, T *d_x, T *d_scale,
T *d_bias, float epsilon, int batch_size,
int feature_size, cudaStream_t stream) {
const int kMaxBlockDim = 512;
int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
((d_scale != nullptr ? 1 : 0) << 1) |
((d_bias != nullptr ? 1 : 0));
if (gradient_flag == 0) return;
if (batch_size == 1) {
LayerNormBackwardWhenBatchSizeIsOne<
T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0,
stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon,
feature_size);
if (d_x != nullptr) {
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<1, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
}
return;
}
auto block_dim = GetDesiredBlockDim(batch_size);
switch (gradient_flag) {
case 1: // d_x == nulptr, d_scale == nullptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, false,
false><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
break;
case 2: // d_x == nullptr, d_scale != nullptr, d_bias == nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, false,
true><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
break;
case 3: // d_x == nullptr, d_scale != nulptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardGradientAll<
T, kBlockDim, false><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size));
}
break;
case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardGradientOnlyDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_y, d_x, mean, var, scale, epsilon, feature_size));
}
break;
case 5: // d_x != nulptr, d_scale == nullptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, true,
false><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
break;
case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, true,
true><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
break;
case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardGradientAll<
T, kBlockDim, true><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size));
}
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
break;
default:
break;
}
}
template <typename T>
class LayerNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x->dims();
auto *x_data = x->data<T>();
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
auto *mean_data = mean->mutable_data<T>(ctx.GetPlace());
auto *var_data = var->mutable_data<T>(ctx.GetPlace());
auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
auto *bias_data = (bias == nullptr ? nullptr : bias->data<T>());
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int batch_size = static_cast<int>(matrix_dim[0]);
int feature_size = static_cast<int>(matrix_dim[1]);
auto stream = ctx.cuda_device_context().stream();
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForward<T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x_data, scale_data, bias_data, y_data, mean_data, var_data,
epsilon, feature_size));
default:
PADDLE_THROW(
"Product from begin_norm_axis to end must be larger than 1");
break;
}
}
};
template <typename T>
class LayerNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
// d_x, d_scale, d_bias may be nullptr
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto *x = ctx.Input<Tensor>("X");
auto *mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale");
auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto *x_data = x->data<T>();
auto *d_y_data = d_y->data<T>();
auto *mean_data = mean->data<T>();
auto *var_data = var->data<T>();
auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
auto *d_scale_data =
(d_scale == nullptr ? nullptr
: d_scale->mutable_data<T>(ctx.GetPlace()));
auto *d_bias_data =
(d_bias == nullptr ? nullptr : d_bias->mutable_data<T>(ctx.GetPlace()));
auto *d_x_data =
(d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace()));
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int batch_size = static_cast<int>(matrix_dim[0]);
int feature_size = static_cast<int>(matrix_dim[1]);
auto stream = ctx.cuda_device_context().stream();
LayerNormBackward<T>(x_data, d_y_data, scale_data, mean_data, var_data,
d_x_data, d_scale_data, d_bias_data, epsilon,
batch_size, feature_size, stream);
}
};
#undef FIXED_BLOCK_DIM_CASE_BASE
#undef FIXED_BLOCK_DIM_CASE
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
layer_norm, layer_norm,
......
...@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) { ...@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) {
std::string ser(seriralized); std::string ser(seriralized);
self.SetAttr(name, ser); self.SetAttr(name, ser);
}) })
.def("block_attr", &pd::OpDesc::GetBlockAttr) .def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
.def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
.def("check_attrs", &pd::OpDesc::CheckAttrs) .def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &pd::OpDesc::InferShape) .def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &pd::OpDesc::InferVarType) .def("infer_var_type", &pd::OpDesc::InferVarType)
......
...@@ -664,7 +664,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -664,7 +664,7 @@ All parameter, weight, gradient are variables in Paddle.
const std::string &, Scope *, std::vector<Scope *> &, const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t, const ExecutionStrategy &, const BuildStrategy &, size_t,
size_t>()) size_t>())
.def("bcast_params", &ParallelExecutor::BCastParamsToDevices) .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element // We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope* // of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
...@@ -48,8 +48,6 @@ from .data_feeder import DataFeeder ...@@ -48,8 +48,6 @@ from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from .transpiler import DistributeTranspiler, InferenceTranspiler, \ from .transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig memory_optimize, release_memory, DistributeTranspilerConfig
from .concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip from . import clip
from . import profiler from . import profiler
...@@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable ...@@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ __all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + [ parallel_executor.__all__ + lod_tensor.__all__ + [
'io', 'io',
......
...@@ -344,7 +344,7 @@ def _append_backward_ops_(block, ...@@ -344,7 +344,7 @@ def _append_backward_ops_(block,
grad_sub_block_list = [] grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first # If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"): if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block")) sub_block = program.block(op.block_attr_id("sub_block"))
grad_sub_block = program.create_block() grad_sub_block = program.create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx) grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op) cb = _callback_lookup_(op)
...@@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()): for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx) op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"): if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr("sub_block")) sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set() new_vars = set()
# create new gradient variables # create new gradient variables
......
...@@ -19,8 +19,7 @@ from .layers import fill_constant ...@@ -19,8 +19,7 @@ from .layers import fill_constant
from . import core from . import core
__all__ = [ __all__ = [
'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select'
'Select'
] ]
...@@ -35,10 +34,10 @@ class Go(BlockGuard): ...@@ -35,10 +34,10 @@ class Go(BlockGuard):
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None: if exc_type is not None:
return False return False
self.construct_go_op() self._construct_go_op()
return super(Go, self).__exit__(exc_type, exc_val, exc_tb) return super(Go, self).__exit__(exc_type, exc_val, exc_tb)
def construct_go_op(self): def _construct_go_op(self):
main_program = self.helper.main_program main_program = self.helper.main_program
go_block = main_program.current_block() go_block = main_program.current_block()
parent_block = main_program.block(main_program.current_block() parent_block = main_program.block(main_program.current_block()
......
...@@ -18,9 +18,7 @@ import six ...@@ -18,9 +18,7 @@ import six
from .framework import Program, default_main_program, Variable from .framework import Program, default_main_program, Variable
from . import core from . import core
__all__ = [ __all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope']
'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var'
]
g_scope = core.Scope() g_scope = core.Scope()
...@@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): ...@@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
return fetch_count > 0 return fetch_count > 0
def fetch_var(name, scope=None, return_numpy=True): def _fetch_var(name, scope=None, return_numpy=True):
""" """
Fetch the value of the variable with the given name from the Fetch the value of the variable with the given name from the
given scope. given scope.
...@@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list): ...@@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list):
return str(feed_var_names + fetch_var_names) return str(feed_var_names + fetch_var_names)
def _as_lodtensor(data, place):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> place = fluid.CPUPlace()
>>> exe = fluid.executor(place)
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, place)
return tensor
class Executor(object): class Executor(object):
""" """
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
...@@ -250,35 +279,6 @@ class Executor(object): ...@@ -250,35 +279,6 @@ class Executor(object):
self.program_caches = dict() self.program_caches = dict()
self._closed = False self._closed = False
def as_lodtensor(self, data):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> exe = fluid.executor(fluid.CPUPlace())
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, self.place)
return tensor
def _get_program_cache(self, program_cache_key): def _get_program_cache(self, program_cache_key):
return self.program_caches.get(program_cache_key, None) return self.program_caches.get(program_cache_key, None)
...@@ -337,7 +337,7 @@ class Executor(object): ...@@ -337,7 +337,7 @@ class Executor(object):
feed_target_name = op.desc.output('Out')[0] feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name] cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor): if not isinstance(cur_feed, core.LoDTensor):
cur_feed = self.as_lodtensor(cur_feed) cur_feed = _as_lodtensor(cur_feed, self.place)
idx = op.desc.attr('col') idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx) core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else: else:
......
...@@ -476,23 +476,25 @@ class Operator(object): ...@@ -476,23 +476,25 @@ class Operator(object):
attrs=None): attrs=None):
self.block = block self.block = block
self.desc = desc self.desc = desc
self.attrs = attrs # note: not add self.attrs here:
if self.attrs is None: # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
self.attrs = dict() op_attrs = attrs
if op_attrs is None:
op_attrs = dict()
del attrs del attrs
op_maker = core.op_proto_and_checker_maker op_maker = core.op_proto_and_checker_maker
if op_maker.kOpRoleAttrName() not in self.attrs: if op_maker.kOpRoleAttrName() not in op_attrs:
self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
role_var_name = op_maker.kOpRoleVarAttrName() role_var_name = op_maker.kOpRoleVarAttrName()
if len(self.block.program. if len(self.block.program.
op_role_var) != 0 and role_var_name not in self.attrs: op_role_var) != 0 and role_var_name not in op_attrs:
self.attrs[role_var_name] = self.block.program.op_role_var op_attrs[role_var_name] = self.block.program.op_role_var
if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0: if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
del self.attrs[role_var_name] del op_attrs[role_var_name]
if len(self.desc.type()) != 0: if len(self.desc.type()) != 0:
return return
...@@ -576,15 +578,14 @@ class Operator(object): ...@@ -576,15 +578,14 @@ class Operator(object):
arg.op = self arg.op = self
self.desc.set_output(out_proto.name, out_arg_names) self.desc.set_output(out_proto.name, out_arg_names)
if self.attrs is not None: if op_attrs is not None:
if not isinstance(self.attrs, dict): if not isinstance(op_attrs, dict):
raise TypeError("'attrs' should be a dict.") raise TypeError("'attrs' should be a dict.")
for attr in proto.attrs: for attr in proto.attrs:
attr_name = attr.name attr_name = attr.name
if (attr_name not in self.attrs) or ( if (attr_name not in op_attrs) or (op_attrs[attr_name] is None):
self.attrs[attr_name] is None):
continue continue
attr_val = self.attrs[attr_name] attr_val = op_attrs[attr_name]
self._update_desc_attr(attr_name, attr_val) self._update_desc_attr(attr_name, attr_val)
self.desc.check_attrs() self.desc.check_attrs()
...@@ -732,7 +733,6 @@ class Operator(object): ...@@ -732,7 +733,6 @@ class Operator(object):
Raises: Raises:
ValueError: If the type of value doesn't match with desc.attr_type(name). ValueError: If the type of value doesn't match with desc.attr_type(name).
""" """
self.attrs[name] = val
self._update_desc_attr(name, val) self._update_desc_attr(name, val)
def _update_desc_attr(self, name, val): def _update_desc_attr(self, name, val):
...@@ -774,9 +774,9 @@ class Operator(object): ...@@ -774,9 +774,9 @@ class Operator(object):
""" """
return self.desc.attr(name) return self.desc.attr(name)
def block_attr(self, name): def block_attr_id(self, name):
""" """
Get the block attribute by name. Get the block attribute's id by name.
Args: Args:
name(str): the attribute name. name(str): the attribute name.
...@@ -784,22 +784,74 @@ class Operator(object): ...@@ -784,22 +784,74 @@ class Operator(object):
Returns: Returns:
int: the block index. int: the block index.
""" """
return self.desc.block_attr(name) return self.desc.block_attr_id(name)
def block_attr(self, name):
"""
Get the block attribute by name.
Args:
name(str): the attribute name.
Returns:
block: the block attribute.
"""
id = self.block_attr_id(name)
assert (id >= 0 and id < len(self.block.program.blocks))
return self.block.program.blocks[id]
def blocks_attr(self, name):
"""
Get the blocks attribute by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks attribute.
"""
attrs = []
for i in self.blocks_attr_ids(name):
assert (i >= 0 and i < len(self.block.program.blocks))
attrs.append(self.block.program.blocks[i])
return attrs
def blocks_attr_ids(self, name):
"""
Get the blocks attribute's ids by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks ids.
"""
return self.desc.blocks_attr_ids(name)
def all_attrs(self): def all_attrs(self):
""" """
Get the attribute dict. Get the attribute dict.
Returns: Returns:
dict: The Operator's attribute dict. dict: The Operator's attribute dict, name->attr.
""" """
attr_names = self.attr_names attr_names = self.attr_names
attr_map = {} attr_map = {}
for n in attr_names: for n in attr_names:
if n == 'sub_block': attr_type = self.desc.attr_type(n)
if attr_type == core.AttrType.BLOCK:
attr_map[n] = self.block_attr(n) attr_map[n] = self.block_attr(n)
else: continue
attr_map[n] = self.attr(n)
if attr_type == core.AttrType.BLOCKS:
attr_map[n] = self.blocks_attr(n)
continue
attr_map[n] = self.attr(n)
return attr_map return attr_map
...@@ -1518,11 +1570,17 @@ class Program(object): ...@@ -1518,11 +1570,17 @@ class Program(object):
The two code snippets above will generate same programs. The two code snippets above will generate same programs.
""" """
if for_test: if for_test:
p = self.inference_optimize() p = self.inference_optimize(export_for_deployment=False)
else: else:
p = Program() p = Program()
p.current_block_idx = self.current_block_idx
p._seed = self._seed
p.desc = core.ProgramDesc(self.desc) p.desc = core.ProgramDesc(self.desc)
p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())] p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
p._current_role = self._current_role
p._op_role_var = self._op_role_var
p._sync_with_cpp() p._sync_with_cpp()
p._copy_param_info_from(self) p._copy_param_info_from(self)
...@@ -1578,7 +1636,7 @@ class Program(object): ...@@ -1578,7 +1636,7 @@ class Program(object):
res._sync_with_cpp() res._sync_with_cpp()
return res return res
def inference_optimize(self): def inference_optimize(self, export_for_deployment=True):
""" """
This method will create a new program and do following adjustments on it: This method will create a new program and do following adjustments on it:
1. Remove all reader variables and their creator ops if exist. 1. Remove all reader variables and their creator ops if exist.
...@@ -1589,6 +1647,10 @@ class Program(object): ...@@ -1589,6 +1647,10 @@ class Program(object):
attribute of operators to :code:`True`. All the :code:`Parameter` attribute of operators to :code:`True`. All the :code:`Parameter`
information will be lost. information will be lost.
Args:
export_for_deployment(bool): remove the read ops that are added by py_reader
for cpp inference library
Notes: This API is a very low level API. Use Notes: This API is a very low level API. Use
:code:`Program.clone(for_test=True)` instead. :code:`Program.clone(for_test=True)` instead.
...@@ -1603,16 +1665,17 @@ class Program(object): ...@@ -1603,16 +1665,17 @@ class Program(object):
# remove all readers and the read_op if exist # remove all readers and the read_op if exist
read_op_idx = 0 read_op_idx = 0
root_block = res.desc.block(0) root_block = res.desc.block(0)
while True: if export_for_deployment:
if read_op_idx >= root_block.op_size() or root_block.op( while True:
read_op_idx).type() == 'read': if read_op_idx >= root_block.op_size() or root_block.op(
break read_op_idx).type() == 'read':
read_op_idx += 1 break
if read_op_idx < root_block.op_size(): read_op_idx += 1
root_block._remove_op(0, read_op_idx + 1) if read_op_idx < root_block.op_size():
for var in root_block.all_vars(): root_block._remove_op(0, read_op_idx + 1)
if var.type() == core.VarDesc.VarType.READER: for var in root_block.all_vars():
root_block._remove_var(var.name()) if var.type() == core.VarDesc.VarType.READER:
root_block._remove_var(var.name())
# change all `is_test` attributes to True # change all `is_test` attributes to True
for i in range(res.desc.num_blocks()): for i in range(res.desc.num_blocks()):
......
...@@ -264,7 +264,8 @@ class NormalInitializer(Initializer): ...@@ -264,7 +264,8 @@ class NormalInitializer(Initializer):
"dtype": int(var.dtype), "dtype": int(var.dtype),
"mean": self._mean, "mean": self._mean,
"std": self._std_dev, "std": self._std_dev,
"seed": self._seed "seed": self._seed,
"use_mkldnn": False
}) })
var.op = op var.op = op
return op return op
......
...@@ -555,7 +555,8 @@ def save_inference_model(dirname, ...@@ -555,7 +555,8 @@ def save_inference_model(dirname,
executor, executor,
main_program=None, main_program=None,
model_filename=None, model_filename=None,
params_filename=None): params_filename=None,
export_for_deployment=True):
""" """
Prune the given `main_program` to build a new program especially for inference, Prune the given `main_program` to build a new program especially for inference,
and then save it and all related parameters to given `dirname` by the `executor`. and then save it and all related parameters to given `dirname` by the `executor`.
...@@ -577,6 +578,8 @@ def save_inference_model(dirname, ...@@ -577,6 +578,8 @@ def save_inference_model(dirname,
params_filename(str|None): The name of file to save all related parameters. params_filename(str|None): The name of file to save all related parameters.
If it is setted None, parameters will be saved If it is setted None, parameters will be saved
in separate files . in separate files .
export_for_deployment(bool): remove the read ops that are added by py_reader
for cpp inference lib. Default True
Returns: Returns:
None None
...@@ -643,7 +646,8 @@ def save_inference_model(dirname, ...@@ -643,7 +646,8 @@ def save_inference_model(dirname,
copy_program.desc.flush() copy_program.desc.flush()
pruned_program = copy_program.prune(targets=target_vars) pruned_program = copy_program.prune(targets=target_vars)
inference_program = pruned_program.inference_optimize() inference_program = pruned_program.inference_optimize(
export_for_deployment=export_for_deployment)
fetch_var_names = [v.name for v in target_vars] fetch_var_names = [v.name for v in target_vars]
prepend_feed_ops(inference_program, feeded_var_names) prepend_feed_ops(inference_program, feeded_var_names)
......
...@@ -164,7 +164,7 @@ def rpn_target_assign(loc, ...@@ -164,7 +164,7 @@ def rpn_target_assign(loc,
}) })
# 4. Reshape and gather the target entry # 4. Reshape and gather the target entry
scores = nn.reshape(x=scores, shape=(-1, 1)) scores = nn.reshape(x=scores, shape=(-1, 2))
loc = nn.reshape(x=loc, shape=(-1, 4)) loc = nn.reshape(x=loc, shape=(-1, 4))
target_label = nn.reshape(x=target_label, shape=(-1, 1)) target_label = nn.reshape(x=target_label, shape=(-1, 1))
target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4)) target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
......
...@@ -273,19 +273,19 @@ class ParallelExecutor(object): ...@@ -273,19 +273,19 @@ class ParallelExecutor(object):
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if self.is_dist: if self.is_dist:
self.bcast_params() self._bcast_params()
if return_numpy: if return_numpy:
return executor.as_numpy(arr) return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def bcast_params(self): def _bcast_params(self):
""" """
Broadcast the parameters to other devices. It is used during Broadcast the parameters to other devices. It is used during
distributed training. distributed training.
""" """
self.executor.bcast_params(set(self.persistable_vars)) self.executor._bcast_params(set(self.persistable_vars))
@property @property
def device_count(self): def device_count(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
import collections
SEED = 1
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(batch_size):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
def operator_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, fluid.framework.Program) or \
isinstance(v, fluid.framework.Block):
continue
elif isinstance(v, core.OpDesc):
if v.serialize_to_string() != b.__dict__[k].serialize_to_string():
raise ValueError("In operator_equal not equal:{0}\n".format(k))
elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
if v0 != v1:
raise ValueError("In operator_equal not equal:{0}\n".format(k))
elif (v != b.__dict__[k]):
raise ValueError("In operator_equal not equal:{0}\n".format(k))
return True
def block_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, core.ProgramDesc) or isinstance(
v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
continue
elif k == "ops":
for i in range(0, len(a.ops)):
if not operator_equal(a.ops[i], b.ops[i]):
raise ValueError("In block_equal not equal:{0}\n".format(k))
assert (len(a.ops) == len(b.ops))
elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
if v0 != v1:
raise ValueError("In block_equal not equal:{0}\n".format(k))
elif (v != b.__dict__[k]):
raise ValueError("In block_equal not equal:{0}\n".format(k))
return True
def program_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, core.ProgramDesc):
continue
elif k == 'blocks':
for i in range(0, len(a.blocks)):
if not block_equal(a.blocks[i], b.blocks[i]):
raise ValueError("In operator_equal not equal:{0}\n".format(
k))
return False
assert (len(a.blocks) == len(b.blocks))
elif (v != b.__dict__[k]):
raise ValueError("In program_equal not equal:{0}\n".format(k))
return True
class TestDistMnist(unittest.TestCase):
def test_desc_clone(self):
get_model(batch_size=20)
pserver_endpoints = "127.0.0.1:9123"
trainers = 1
current_endpoint = "127.0.0.1:9123"
t = get_transpiler(0,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
main = pserver_prog.clone()
startup = startup_prog.clone()
self.assertTrue(program_equal(main, pserver_prog))
self.assertTrue(program_equal(startup, startup_prog))
if __name__ == "__main__":
unittest.main()
...@@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase): ...@@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase):
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._python_interp = "python" self._python_interp = "python"
def start_pserver(self, model_file): def start_pserver(self, model_file, check_error_log):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep, (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
...@@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase): ...@@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase):
(self._python_interp, model_file, self._ps_endpoints, ps1_ep, (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers) self._trainers)
ps0_pipe = subprocess.PIPE
ps1_pipe = subprocess.PIPE
if check_error_log:
print("ps0_cmd:", ps0_cmd)
print("ps1_cmd:", ps1_cmd)
ps0_pipe = open("/tmp/ps0_err.log", "wb")
ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_proc = subprocess.Popen( ps0_proc = subprocess.Popen(
ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
ps1_proc = subprocess.Popen( ps1_proc = subprocess.Popen(
ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)
return ps0_proc, ps1_proc
if not check_error_log:
return ps0_proc, ps1_proc, None, None
else:
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _wait_ps_ready(self, pid): def _wait_ps_ready(self, pid):
retry_times = 50 retry_times = 50
...@@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase): ...@@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase):
(e, retry_times)) (e, retry_times))
retry_times -= 1 retry_times -= 1
def check_with_place(self, model_file, delta=1e-3): def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs = { required_envs = {
"PATH": os.getenv("PATH"), "PATH": os.getenv("PATH"),
...@@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase): ...@@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase):
"FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1" "FLAGS_cudnn_deterministic": "1"
} }
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
# Run local to get a base line # Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs) env_local.update(required_envs)
local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
(self._python_interp, model_file, (self._python_interp, model_file,
"127.0.0.1:1234", "127.0.0.1:1234", 1) "127.0.0.1:1234", "127.0.0.1:1234", 1)
local_proc = subprocess.Popen( if not check_error_log:
local_cmd.split(" "), local_proc = subprocess.Popen(
stdout=subprocess.PIPE, local_cmd.split(" "),
stderr=subprocess.PIPE, stdout=subprocess.PIPE,
env=env_local) stderr=subprocess.PIPE,
env=env_local)
else:
print("trainer cmd:", local_cmd)
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen(
local_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=err_log,
env=env_local)
local_proc.wait() local_proc.wait()
out, err = local_proc.communicate() out, err = local_proc.communicate()
local_ret = out local_ret = out
...@@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase): ...@@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('local_stderr: %s\n' % err) sys.stderr.write('local_stderr: %s\n' % err)
# Run dist train to compare with local results # Run dist train to compare with local results
ps0, ps1 = self.start_pserver(model_file) ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
check_error_log)
self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid) self._wait_ps_ready(ps1.pid)
...@@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase): ...@@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase):
env1.update(required_envs) env1.update(required_envs)
FNULL = open(os.devnull, 'w') FNULL = open(os.devnull, 'w')
tr0_pipe = subprocess.PIPE
tr1_pipe = subprocess.PIPE
if check_error_log:
print("tr0_cmd:", tr0_cmd)
print("tr1_cmd:", tr1_cmd)
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen( tr0_proc = subprocess.Popen(
tr0_cmd.split(" "), tr0_cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=tr0_pipe,
env=env0) env=env0)
tr1_proc = subprocess.Popen( tr1_proc = subprocess.Popen(
tr1_cmd.split(" "), tr1_cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=tr1_pipe,
env=env1) env=env1)
tr0_proc.wait() tr0_proc.wait()
...@@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase): ...@@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase):
local_first_loss = eval(local_lines[0])[0] local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0] local_last_loss = eval(local_lines[1])[0]
# close trainer file
if check_error_log:
tr0_pipe.close()
tr1_pipe.close()
ps0_pipe.close()
ps1_pipe.close()
# FIXME: use terminate() instead of sigkill. # FIXME: use terminate() instead of sigkill.
os.kill(ps0.pid, signal.SIGKILL) os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL)
......
...@@ -259,7 +259,7 @@ class TestLRDecayConditional(TranspilerTest): ...@@ -259,7 +259,7 @@ class TestLRDecayConditional(TranspilerTest):
serv_op = pserver.blocks[0].ops[0] serv_op = pserver.blocks[0].ops[0]
sub_blocks = [] sub_blocks = []
optimize_blocks = [] optimize_blocks = []
for b in serv_op.attrs["optimize_blocks"]: for b in serv_op.all_attrs()["optimize_blocks"]:
optimize_blocks.append(b.idx) optimize_blocks.append(b.idx)
for b in pserver.blocks: for b in pserver.blocks:
if b.idx not in optimize_blocks: if b.idx not in optimize_blocks:
...@@ -536,5 +536,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ...@@ -536,5 +536,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestRMSPropOptimizer(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
optimizer.minimize(avg_cost)
return
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
pserver2, startup2 = self.get_pserver(self.pserver2_ep)
self.assertEqual(len(pserver.blocks), 3)
# block1~2: optimize pass
self.assertEqual([op.type for op in pserver.blocks[1].ops],
["sum", "scale", "rmsprop"])
# the variable #fc_w will be split into two blocks
fc_w_var = startup.global_block().var("fc_w.block1")
self.assertEqual(fc_w_var.shape, (500, 1000))
moment_var = startup.global_block().var("momentum_1")
self.assertEqual(moment_var.shape, (500, 1000))
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest): ...@@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest):
layers.assign(input=val, output=x) layers.assign(input=val, output=x)
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_main_program(), feed={}, fetch_list=[]) exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
fetched_x = fluid.fetch_var("x") fetched_x = fluid.executor._fetch_var("x")
self.assertTrue( self.assertTrue(
numpy.array_equal(fetched_x, val), numpy.array_equal(fetched_x, val),
"fetch_x=%s val=%s" % (fetched_x, val)) "fetch_x=%s val=%s" % (fetched_x, val))
......
...@@ -17,6 +17,7 @@ import unittest ...@@ -17,6 +17,7 @@ import unittest
from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid as fluid
main_program = default_main_program() main_program = default_main_program()
...@@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase): ...@@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase):
new_program = main_program.clone() new_program = main_program.clone()
self.assertNotEqual(0, len(new_program.blocks[0].all_parameters())) self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
def test_program_inference_optimize(self):
def net():
reader = fluid.layers.py_reader(
capacity=10,
shapes=[[-1, 10], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
use_double_buffer=True)
in_data, label = fluid.layers.read_file(reader)
predict_label = fluid.layers.fc(in_data, size=2, act='softmax')
loss = fluid.layers.mean(
fluid.layers.cross_entropy(
input=predict_label, label=label))
optimizer = fluid.optimizer.Adam()
optimizer.minimize(loss)
startup_program = fluid.Program()
main_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
net()
no_read_program = main_program.inference_optimize()
keep_read_program = main_program.inference_optimize(
export_for_deployment=False)
no_read_ops = no_read_program.global_block().ops
keep_read_ops = keep_read_program.global_block().ops
self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2)
self.assertEqual(keep_read_ops[0].type, 'create_double_buffer_reader')
self.assertEqual(keep_read_ops[1].type, 'read')
for i in range(len(no_read_ops)):
self.assertEqual(no_read_ops[i].type, keep_read_ops[i + 2].type)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase): ...@@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase):
self.assertEqual(8, len(op.attr_names())) self.assertEqual(8, len(op.attr_names()))
op.set_block_attr("block_attr", program_desc.block(0)) op.set_block_attr("block_attr", program_desc.block(0))
self.assertEqual(0, op.block_attr("block_attr")) self.assertEqual(0, op.block_attr_id("block_attr"))
mul_op = block.append_op() mul_op = block.append_op()
mul_op.set_type("mul") mul_op.set_type("mul")
......
...@@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase): ...@@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase):
next_data = np.random.uniform( next_data = np.random.uniform(
low=0, high=1000, low=0, high=1000,
size=(batch_size, ) + shape[1:]).astype(dtype) size=(batch_size, ) + shape[1:]).astype(dtype)
in_data.append(executor.as_lodtensor(next_data)) in_data.append(
fluid.executor._as_lodtensor(next_data, place))
self.inputs.append(in_data) self.inputs.append(in_data)
......
...@@ -22,7 +22,7 @@ pos_enc_param_names = ( ...@@ -22,7 +22,7 @@ pos_enc_param_names = (
"src_pos_enc_table", "src_pos_enc_table",
"trg_pos_enc_table", ) "trg_pos_enc_table", )
batch_size = 64 batch_size = 2
def position_encoding_init(n_position, d_pos_vec): def position_encoding_init(n_position, d_pos_vec):
......
...@@ -584,12 +584,12 @@ class DistributeTranspiler(object): ...@@ -584,12 +584,12 @@ class DistributeTranspiler(object):
if op.type in [ if op.type in [
"gaussian_random", "fill_constant", "uniform_random" "gaussian_random", "fill_constant", "uniform_random"
]: ]:
op.attrs["shape"] = new_outputs["Out"].shape op.set_attr("shape", list(new_outputs["Out"].shape))
s_prog.global_block().append_op( s_prog.global_block().append_op(
type=op.type, type=op.type,
inputs=new_inputs, inputs=new_inputs,
outputs=new_outputs, outputs=new_outputs,
attrs=op.attrs) attrs=op.all_attrs())
return s_prog return s_prog
# ====================== private transpiler functions ===================== # ====================== private transpiler functions =====================
...@@ -603,7 +603,7 @@ class DistributeTranspiler(object): ...@@ -603,7 +603,7 @@ class DistributeTranspiler(object):
self.table_name = None self.table_name = None
for op in self.origin_program.global_block().ops: for op in self.origin_program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE: if op.type == LOOKUP_TABLE_TYPE:
if op.attrs['is_distributed'] is True: if op.attr('is_distributed') is True:
if self.table_name is None: if self.table_name is None:
self.table_name = op.input("W")[0] self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]: if self.table_name != op.input("W")[0]:
...@@ -749,14 +749,14 @@ class DistributeTranspiler(object): ...@@ -749,14 +749,14 @@ class DistributeTranspiler(object):
out_name = op.output("Out") out_name = op.output("Out")
ids_var = program.global_block().vars[ids_name[0]] ids_var = program.global_block().vars[ids_name[0]]
prefetch_input_vars = self.create_splited_vars( prefetch_input_vars = self._create_splited_vars(
source_var=ids_var, source_var=ids_var,
block=program.global_block(), block=program.global_block(),
tag="_prefetch_in_") tag="_prefetch_in_")
self.all_prefetch_input_vars.append(prefetch_input_vars) self.all_prefetch_input_vars.append(prefetch_input_vars)
out_var = program.global_block().vars[out_name[0]] out_var = program.global_block().vars[out_name[0]]
prefetch_output_vars = self.create_splited_vars( prefetch_output_vars = self._create_splited_vars(
source_var=out_var, source_var=out_var,
block=program.global_block(), block=program.global_block(),
tag="_prefetch_out_") tag="_prefetch_out_")
...@@ -1038,7 +1038,7 @@ class DistributeTranspiler(object): ...@@ -1038,7 +1038,7 @@ class DistributeTranspiler(object):
program.global_block()._sync_with_cpp() program.global_block()._sync_with_cpp()
return var_mapping return var_mapping
def create_splited_vars(self, source_var, block, tag): def _create_splited_vars(self, source_var, block, tag):
return [ return [
block.create_var( block.create_var(
name=str(source_var.name + tag + str(index)), name=str(source_var.name + tag + str(index)),
...@@ -1182,18 +1182,39 @@ class DistributeTranspiler(object): ...@@ -1182,18 +1182,39 @@ class DistributeTranspiler(object):
program = optimize_block.program program = optimize_block.program
pserver_block = program.global_block() pserver_block = program.global_block()
new_inputs = dict() new_inputs = dict()
# update param/grad shape first, then other inputs like # update param/grad shape first, then other inputs like
# moment can use the updated shape # moment can use the updated shape
def _get_param_block(opt_op):
# param is already created on global program
param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, opt_op.input("Param")[0]):
param_block = p
break
return param_block
for key in opt_op.input_names: for key in opt_op.input_names:
if key == "Grad": if key == "Grad":
new_inputs[key] = merged_var new_inputs[key] = merged_var
# For RMSProp optimizer
elif key == "Moment" or key == "MeanSquare":
param_block = _get_param_block(opt_op)
if not param_block:
return
moment_var = origin_program.global_block().vars[opt_op.input(
key)[0]]
tmpvar = pserver_block.create_var(
name=moment_var.name,
persistable=moment_var.persistable,
dtype=moment_var.dtype,
# change to use same shape as param
# TODO(typhoonzero): didn't append .block in the var name,
# may affect checkpoint saving? Need to verify.
shape=param_block.shape)
new_inputs[key] = tmpvar
elif key == "Param": elif key == "Param":
# param is already created on global program param_block = _get_param_block(opt_op)
param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, opt_op.input(key)[0]):
param_block = p
break
if not param_block: if not param_block:
return return
tmpvar = pserver_block.create_var( tmpvar = pserver_block.create_var(
...@@ -1219,7 +1240,7 @@ class DistributeTranspiler(object): ...@@ -1219,7 +1240,7 @@ class DistributeTranspiler(object):
for key in opt_op.input_names: for key in opt_op.input_names:
new_shape = None new_shape = None
if key in ["Param", "Grad", "LearningRate"]: if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]:
continue continue
var = self.origin_program.global_block().vars[opt_op.input(key)[0]] var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
# update accumulator variable shape # update accumulator variable shape
...@@ -1242,7 +1263,7 @@ class DistributeTranspiler(object): ...@@ -1242,7 +1263,7 @@ class DistributeTranspiler(object):
type=opt_op.type, type=opt_op.type,
inputs=new_inputs, inputs=new_inputs,
outputs=outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.all_attrs())
def _is_splited_grad_var(self, var, var_dict): def _is_splited_grad_var(self, var, var_dict):
grad_block = None grad_block = None
...@@ -1275,7 +1296,7 @@ class DistributeTranspiler(object): ...@@ -1275,7 +1296,7 @@ class DistributeTranspiler(object):
block._clone_variable(var) block._clone_variable(var)
return block.append_op( return block.append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
def _append_pserver_non_opt_ops(self, optimize_block, opt_op): def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
program = optimize_block.program program = optimize_block.program
...@@ -1316,7 +1337,7 @@ class DistributeTranspiler(object): ...@@ -1316,7 +1337,7 @@ class DistributeTranspiler(object):
type=opt_op.type, type=opt_op.type,
inputs=inputs, inputs=inputs,
outputs=outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.all_attrs())
def _is_op_connected(self, op1, op2): def _is_op_connected(self, op1, op2):
# If one op's input is another op's output or # If one op's input is another op's output or
...@@ -1421,8 +1442,8 @@ class DistributeTranspiler(object): ...@@ -1421,8 +1442,8 @@ class DistributeTranspiler(object):
# optimize # optimize
op_maker = core.op_proto_and_checker_maker op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attrs and \ if op_maker.kOpRoleAttrName() in op.attr_names and \
int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True return True
return False return False
...@@ -1445,8 +1466,8 @@ class DistributeTranspiler(object): ...@@ -1445,8 +1466,8 @@ class DistributeTranspiler(object):
# and op_role_var to get the pair. # and op_role_var to get the pair.
for input_name in op.input_arg_names: for input_name in op.input_arg_names:
if input_name.find("@GRAD") != -1 and \ if input_name.find("@GRAD") != -1 and \
op.attrs[RPC_OP_ROLE_ATTR_NAME]: op.attr(RPC_OP_ROLE_ATTR_NAME):
param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0] param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
params_grads.append([ params_grads.append([
origin_var_dict[param_name], origin_var_dict[param_name],
origin_var_dict[input_name] origin_var_dict[input_name]
......
...@@ -57,10 +57,10 @@ class InferenceTranspiler(object): ...@@ -57,10 +57,10 @@ class InferenceTranspiler(object):
scope = global_scope() scope = global_scope()
if not isinstance(scope, core.Scope): if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None") raise TypeError("scope should be as Scope type or None")
self.fuse_batch_norm(program, place, scope) self._fuse_batch_norm(program, place, scope)
self.fuse_relu_mkldnn(program) self._fuse_relu_mkldnn(program)
def fuse_relu_mkldnn(self, program): def _fuse_relu_mkldnn(self, program):
''' '''
Transpile the program by fused relu activation for MKLDNN program. Transpile the program by fused relu activation for MKLDNN program.
...@@ -104,7 +104,7 @@ class InferenceTranspiler(object): ...@@ -104,7 +104,7 @@ class InferenceTranspiler(object):
# And a better solution will be considered later. # And a better solution will be considered later.
program = program.clone() program = program.clone()
def fuse_batch_norm(self, program, place, scope): def _fuse_batch_norm(self, program, place, scope):
''' '''
Transpile the program by fused batch normalization. Transpile the program by fused batch normalization.
......
requests==2.9.2 requests==2.9.2
numpy>=1.12 numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
protobuf==3.1 protobuf==3.1
recordio>=0.1.0 recordio>=0.1.0
matplotlib matplotlib
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册