diff --git a/CMakeLists.txt b/CMakeLists.txt index f56c5d382af8cdfb5a941ee272a0f8d22ec04d67..920c20d6f813c14df8e02593b1c4a5a13cc11ef0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,12 +204,11 @@ include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) +set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE) if(WITH_GPU) include(cuda) include(tensorrt) include(external/anakin) -else() - set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) endif() include(cudnn) # set cudnn libraries, must before configure diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e4af34d10ed92c501dd805addb62747c91c00978..c35096e09b5685ee30f20648e7dd461f71e0b1c4 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -97,6 +97,14 @@ if(WITH_GPU) endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() + if(WITH_ANAKIN) + if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) + message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile") + endif() + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile") + endif() + endif() elseif(WITH_AMD_GPU) add_definitions(-DPADDLE_WITH_HIP) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 8b7d91f234594becdda805c089fac0bb4e4e8e44..403873a51017b9bb7c888bed77d89ca9b15b68d2 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN) return() endif() -set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH - "Anakin install path." FORCE) -set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") -set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") +INCLUDE(ExternalProject) +set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin) +# the anakin install dir is only default one now +set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output) +set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR}) +set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR}) +set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so) +set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so) + +# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later. +set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin") +execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}") +execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}") + +include_directories(${ANAKIN_INCLUDE}) +include_directories(${ANAKIN_INCLUDE}/saber/) set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable @@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-reorder -Wno-error=cpp) -set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz") - -# A helper function used in Anakin, currently, to use it, one need to recursively include -# nearly all the header files. -function(fetch_include_recursively root_dir) - if (IS_DIRECTORY ${root_dir}) - include_directories(${root_dir}) - endif() - - file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) - foreach(sub ${ALL_SUB}) - if (IS_DIRECTORY ${root_dir}/${sub}) - fetch_include_recursively(${root_dir}/${sub}) - endif() - endforeach() -endfunction() - -if (NOT EXISTS "${ANAKIN_INSTALL_DIR}") - # download library - message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") - execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}") - execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz") -endif() +ExternalProject_Add( + extern_anakin + ${EXTERNAL_PROJECT_LOG_ARGS} + # TODO(luotao): use PaddlePaddle/Anakin later + GIT_REPOSITORY "https://github.com/luotao1/Anakin" + GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be" + PREFIX ${ANAKIN_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DUSE_GPU_PLACE=YES + -DUSE_X86_PLACE=YES + -DBUILD_WITH_UNIT_TEST=NO + -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf + -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml + -DCUDNN_ROOT=${CUDNN_ROOT} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} +) -if (WITH_ANAKIN) - message(STATUS "Anakin for inference is enabled") - message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - fetch_include_recursively(${ANAKIN_INCLUDE}) - link_directories(${ANAKIN_LIBRARY}) -endif() +message(STATUS "Anakin for inference is enabled") +message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") + +add_library(anakin_shared SHARED IMPORTED GLOBAL) +set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) +add_dependencies(anakin_shared extern_anakin protobuf mklml) + +add_library(anakin_saber SHARED IMPORTED GLOBAL) +set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) +add_dependencies(anakin_saber extern_anakin protobuf mklml) + +list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index aeb081e76e5bc5b9d3d81ce625195c800174ab6c..834ab5a9e527355d3664313d38cd4920f6fbf535 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU) copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api SRCS ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api - ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release + ${ANAKIN_INSTALL_DIR} # anakin release DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin) list(APPEND inference_deps anakin_inference_lib) endif() diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst index db2842e7f23e74130a966bb347004bee1ccb08fd..f23ecc1f80030f20359ce9675130a167722606c9 100644 --- a/doc/fluid/api/executor.rst +++ b/doc/fluid/api/executor.rst @@ -38,11 +38,3 @@ _switch_scope .. autofunction:: paddle.fluid.executor._switch_scope :noindex: -.. _api_fluid_executor_fetch_var: - -fetch_var ---------- - -.. autofunction:: paddle.fluid.executor.fetch_var - :noindex: - diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst index 51cdfe0c2ed045a5b3247c4fdec9868d756eae86..7eab58355c3648d929d3b5d98984adce9034f016 100644 --- a/doc/fluid/api/fluid.rst +++ b/doc/fluid/api/fluid.rst @@ -106,22 +106,6 @@ _switch_scope .. autofunction:: paddle.fluid._switch_scope :noindex: -.. _api_fluid_fetch_var: - -fetch_var ---------- - -.. autofunction:: paddle.fluid.fetch_var - :noindex: - -.. _api_fluid_Go: - -Go --- - -.. autoclass:: paddle.fluid.Go - :members: - :noindex: .. _api_fluid_make_channel: diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index dd172ff9c97814c089ddb2e5bf729880cf0c9cdb..46e56981ea57722bbc064304761e7ab7b7aee141 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) @@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) @@ -34,21 +37,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True)) -paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None) -paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -62,20 +54,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', ' paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.InferenceTranspiler.__init__ -paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None) -paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0)) -paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None @@ -89,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) @@ -338,14 +326,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.transpiler.InferenceTranspiler.__init__ -paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index a190199f1cb1361f67f20c755b8e7ef52c284adc..03f7e71c03b8dd75d2a47cb4c6d1ef1a71792cf3 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const { } } -int OpDesc::GetBlockAttr(const std::string &name) const { +std::vector OpDesc::GetBlocksAttrIds(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + auto blocks = boost::get>(it->second); + + std::vector ids; + for (auto n : blocks) { + ids.push_back(n->ID()); + } + + return ids; +} + +int OpDesc::GetBlockAttrId(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); return boost::get(it->second)->ID(); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 74dd8ec002005dd080424b48b5db1a2574a6974f..b77d84125a23b81c3de4123bea6f0e09cd6d1e90 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -83,7 +83,9 @@ class OpDesc { Attribute GetNullableAttr(const std::string &name) const; - int GetBlockAttr(const std::string &name) const; + int GetBlockAttrId(const std::string &name) const; + + std::vector GetBlocksAttrIds(const std::string &name) const; void Rename(const std::string &old_name, const std::string &new_name); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 1e01a6e900404990e16674755367d2fc6d832725..20bdc7830f32564448a69e9cd76c02585b7a1aca 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { int sub_block_id = - o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name); + o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); } } diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index c4ab26a2288bb9d8f3cd54a797d2062e0606b219..9318f1089781b30468cf4d3c7151d0dd26e50a9c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager { if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( - {"elementwise_add", "mul", "conv2d", "pool2d", "relu"}); + {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"}); if (!node->IsFunction()) return false; const auto* func = static_cast(node); - if (teller_set.count(func->func_type())) + if (teller_set.count(func->func_type())) { return true; - else { + } else { return false; } }; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 08d0f493ab30d92a121d089d9003bc575429b4dd..83867e0a2c198f265cb36be3a71546796dfbec67 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -45,7 +45,6 @@ endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) - cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e endif() if (WITH_ANAKIN) # only needed in CI - # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, - # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to - # compile the libinference_anakin_api.a and compile with anakin.so. - fetch_include_recursively(${ANAKIN_INCLUDE}) # compile the libinference_anakin_api.a and anakin.so. - nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) - nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) - target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - target_link_libraries(inference_anakin_api anakin anakin_saber_common) - target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) + nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin) + function(anakin_target target_name) + target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endfunction() + anakin_target(inference_anakin_api) + #anakin_target(inference_anakin_api_shared) if (WITH_TESTING) - # this test is unstable, disable it first. - #cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc - #ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin - #DEPS inference_anakin_api_shared) - #target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - endif(WITH_TESTING) + cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc + ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin + DEPS inference_anakin_api dynload_cuda SERIAL) + target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endif(WITH_TESTING) endif() diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b24414e8245b1a4d90acce4fa1ad5690e06b47dd..794534467be066e91db2b4c204913ab2cf12dbfd 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -45,7 +45,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 8f42a37cd3f8978b917b42e8f45a128b8422aa57..6863b035d8cd9dfb21aed3947226a796778912a4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -activation_op.cc +activation_op.cc softmax_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) + +nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0064f90fd7944403c14d4d47616ea82f681ceb74 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights. + */ +class SoftMaxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) + << "convert a fluid softmax op to tensorrt softmax layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax, + *const_cast(input1)); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); +REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..503ce71f7fb4377bb4304569b7484fb25abdb284 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(SoftMaxOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(8, parameters, scope, 1000); + + std::vector tensor_shape{8, 10}; + validator.DeclInputVar("softmax-X", tensor_shape, + nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("softmax"); + desc.SetInput("X", {"softmax-X"}); + desc.SetOutput("Out", {"softmax-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 63c2f978f253df11100ecca83acae5eab6a0337d..4265f33f28fe36b1745baf4761c3c85e3a281d6b 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -79,6 +79,12 @@ class TRTConvertValidation { } // Declare a Variable as input with random initialization. + void DeclInputVar(const std::string& name, const std::vector tensor_dims, + const nvinfer1::Dims& trt_dims) { + DeclVar(name, tensor_dims); + engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims); + } + void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims); // Declare TRT inputs. @@ -94,12 +100,18 @@ class TRTConvertValidation { DeclVar(name, dims); } - // Declare a variable in a fluid Scope. - void DeclVar(const std::string& name, const nvinfer1::Dims& dims, - bool is_param = false) { + void DeclVar(const std::string& name, const std::vector dim_vec) { platform::CPUPlace place; platform::CPUDeviceContext ctx(place); + auto* x = scope_.Var(name); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + RandomizeTensor(x_tensor, place, ctx); + } + // Declare a variable in a fluid Scope. + void DeclVar(const std::string& name, const nvinfer1::Dims& dims, + bool is_param = false) { // Init Fluid tensor. std::vector dim_vec(dims.d, dims.d + dims.nbDims); // There is no batchsize in ITensor's shape, but We should add it to @@ -107,10 +119,8 @@ class TRTConvertValidation { // if_add_batch_ flag is true, add the max batchsize to dim_vec. if (is_param != true && if_add_batch_ == true) dim_vec.insert(dim_vec.begin(), max_batch_size_); - auto* x = scope_.Var(name); - auto* x_tensor = x->GetMutable(); - x_tensor->Resize(framework::make_ddim(dim_vec)); - RandomizeTensor(x_tensor, place, ctx); + + DeclVar(name, dim_vec); } void SetOp(const framework::proto::OpDesc& desc) { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8cd80ca6be45713ca471793c76f33bc018d697a2..ae37c7092983a0d181b1f399c847115719d2d089 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -235,7 +235,12 @@ else() endif() op_library(cross_entropy_op DEPS cross_entropy) -op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +if(WITH_GPU) + op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub) +else() + op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +endif() + op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) @@ -273,9 +278,9 @@ op_library(squeeze_op DEPS reshape_op) op_library(extract_rows_op DEPS memory) op_library(flatten_op DEPS reshape_op) - if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) + op_library(layer_norm_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 5b5a220cf90e7813f914ae35733e7a4103391b2d..a2a871efa850df5101be7c27ebd81456acace7e1 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -188,6 +188,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); -REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop, ops::CropKernel); REGISTER_OP_CPU_KERNEL( crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu index 1a391860463dba14ad0de755ceb659bc9f64adc9..b75678217e36aa2297c68a7f8e2a9dfafadaca72 100644 --- a/paddle/fluid/operators/crop_op.cu +++ b/paddle/fluid/operators/crop_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop, ops::CropKernel); REGISTER_OP_CUDA_KERNEL( crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 772e80bbea4f2db654cefd0dcb404bc33803bd7a..2d7d33bd4f9b42b644444912570375bad92ba6c2 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -58,32 +58,74 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { return res; } -template +template +void CropFunction(const framework::ExecutionContext& context) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto out_dims = out->dims(); + if (out_dims[0] == -1) { + out_dims[0] = x->dims()[0]; + } + out->mutable_data(out_dims, context.GetPlace()); + auto x_stride = framework::stride(x->dims()); + auto out_stride = framework::stride(out->dims()); + auto offsets = GetOffsets(context); + int64_t offset = 0; + for (size_t i = 0; i < offsets.size(); ++i) { + offset += (x_stride[i] * offsets[i]); + } + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + Eigen::array e_offsets; + Eigen::array e_shape; + for (size_t i = 0; i < D; ++i) { + e_offsets[i] = offsets[i]; + e_shape[i] = out->dims()[i]; + } + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); +} + +template class CropKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - const T* x_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); - auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); - auto offsets = GetOffsets(context); - int64_t offset = 0; - for (size_t i = 0; i < offsets.size(); ++i) { - offset += (x_stride[i] * offsets[i]); + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + CropFunction(context); + break; + case 2: + CropFunction(context); + break; + case 3: + CropFunction(context); + break; + case 4: + CropFunction(context); + break; + case 5: + CropFunction(context); + break; + case 6: + CropFunction(context); + break; + default: + PADDLE_THROW( + "CropOp only support tensors with no more than 6 dimensions."); } - StridedMemcpy(context.device_context(), x_data + offset, x_stride, - out->dims(), out_stride, out_data); } }; template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); if (d_x != nullptr) { auto* d_out = context.Input(framework::GradVarName("Out")); - d_x->mutable_data(context.GetPlace()); + d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu index 6cbf6066c92b02eb75922587f9da5192bed15580..dfff518f170b56d180b6883c363effb8dbd677b6 100644 --- a/paddle/fluid/operators/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -16,60 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/platform/float16.h" -namespace paddle { -namespace operators { - -template -__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n, - int post, int size) { - int idx_x = threadIdx.x + blockIdx.x * blockDim.x; - if (idx_x < size) { - int idx_y = idx_x / post - (idx_x / (n * post)) * n; - z[idx_x] = x[idx_x] + y[idx_y]; - } -} - -template -class ElementwiseAddKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - - const auto x = ctx.Input("X"); - const auto y = ctx.Input("Y"); - auto z = ctx.Output("Out"); - auto *z_data = z->mutable_data(ctx.GetPlace()); - - auto &device = *(ctx.cuda_device_context().eigen_device()); - const framework::DDim &x_dim = x->dims(); - framework::DDim y_dim = y->dims(); - int size = x->numel(); - if (x_dim == y_dim) { - auto dim = framework::make_ddim({size}); - auto z_eigen = framework::EigenTensor::From(*z, dim); - auto x_eigen = framework::EigenTensor::From(*x, dim); - auto y_eigen = framework::EigenTensor::From(*y, dim); - z_eigen.device(device) = x_eigen + y_eigen; - } else { - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - y_dim = trim_trailing_singular_dims(y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - int threads = 512; - int grids = (size + threads - 1) / threads; - auto stream = ctx.cuda_device_context().stream(); - ElementwiseAddCUDAKernel<<>>( - x->data(), y->data(), z_data, n, post, size); - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index 0b19723720171a857c946880c246e2247a0023a7..5356105e2e551c0528694091608fc7585dce66d2 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; - if (dx != nullptr) { - // In fact, we can just share memory, but it may cause a bug of memory - // optimizer - // dx->ShareDataWith(*dout); - framework::TensorCopy(*dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } - - if (dy == nullptr) return; - - const framework::DDim& x_dim = dout->dims(); - framework::DDim y_dim = dy->dims(); - if (x_dim == y_dim) { - // dy->ShareDataWith(*dout); - framework::TensorCopy(*dout, ctx.GetPlace(), - ctx.template device_context(), dy); + if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr && + dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_add_grad(ctx, x, y, out, dout, dx, dy); } else { - dy->mutable_data(ctx.GetPlace()); - // Perform reduction to dout to calculate dy - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - y_dim = trim_trailing_singular_dims(y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - - auto& device = - *(ctx.template device_context().eigen_device()); - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - auto eigen_dout = framework::EigenTensor::From( - *dout, framework::make_ddim({pre, n, post})); - auto eigen_dy = - framework::EigenTensor::From(*dy, framework::make_ddim({n})); - eigen_dy.device(device) = eigen_dout.sum( - framework::EigenDim<2>::From(framework::make_ddim({0, 2}))); + default_elementwise_add_grad(ctx, x, y, out, dout, dx, + dy); } } }; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 6840e1e08f3d5bc84a05f15e30982c7cfb59680b..0886c41a1b582881faf24f5531d414db4e4db71c 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/layer_norm_op.h" +namespace paddle { +namespace operators { + +inline static int GetDesiredBlockDim(int block_dim) { + const int kMaxBlockDim = 512; + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << (static_cast(std::log2f(block_dim)))); +} + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__) + +static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); } +static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); } + +template +struct PairForLayerNorm { + __device__ __forceinline__ PairForLayerNorm() {} + __device__ __forceinline__ PairForLayerNorm(const T &first, const T &second) + : first_(first), second_(second) {} + + T first_; + T second_; +}; + +template +struct PairForLayerNormAddFunctor { + __device__ __forceinline__ PairForLayerNorm operator()( + const PairForLayerNorm &p1, const PairForLayerNorm &p2) { + return PairForLayerNorm(p1.first_ + p2.first_, p1.second_ + p2.second_); + } +}; + +template +__global__ void LayerNormForward(const T *x, const T *scale, const T *bias, + T *y, T *mean, T *var, float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + // Step 1: Reduce to calculate mean and var + T mean_val = static_cast(0); + T var_val = static_cast(0); + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T tmp = x[i]; + mean_val += tmp; + var_val += (tmp * tmp); + } + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(mean_val, var_val), + PairForLayerNormAddFunctor()); + if (threadIdx.x == 0) { + auto tmp = pair.first_ / feature_size; + mean[blockIdx.x] = tmp; + var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp; + } + __syncthreads(); + mean_val = mean[blockIdx.x]; + var_val = static_cast(real_sqrt(var[blockIdx.x] + epsilon)); + + // Step 2: Calculate y + if (scale != nullptr) { + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j]; + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = scale[j] * (x[i] - mean_val) / var_val; + } + } + } else { // scale == nullptr + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = (x[i] - mean_val) / var_val + bias[j]; + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = (x[i] - mean_val) / var_val; + } + } + } +} + +// Make sure that d_scale != nullptr && d_bias != nullptr +// Since d_scale != nullptr, scale would not be nullptr +template +__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, + T *d_scale, T *d_bias, T *d_x, + const T *mean, const T *var, + const T *scale, float epsilon, + int batch_size, int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int beg_idx = threadIdx.x * feature_size + blockIdx.x; + int end_idx = batch_size * feature_size + blockIdx.x; + int stride = BlockDim * feature_size; + + T d_scale_partial = 0, d_bias_partial = 0; + + for (int i = beg_idx; i < end_idx; i += stride) { + int row_idx = i / feature_size; + auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); + d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; + d_bias_partial += d_y[i]; + if (HasDx) { + d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; + } + } + + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_scale_partial, d_bias_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_scale[blockIdx.x] = pair.first_; + d_bias[blockIdx.x] = pair.second_; + } +} + +// Make sure that there is only one true expression: d_scale != nullptr +// or d_bias != nullptr +// Notice: scale may be nullptr +template +__global__ void LayerNormBackwardGradientScaleOrBias( + const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean, + const T *var, const T *scale, float epsilon, int batch_size, + int feature_size) { + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int beg_idx = threadIdx.x * feature_size + blockIdx.x; + int end_idx = batch_size * feature_size + blockIdx.x; + int stride = BlockDim * feature_size; + T d_scale_or_d_bias_partial = 0; + + for (int i = beg_idx; i < end_idx; i += stride) { + int row_idx = i / feature_size; + auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); + if (HasDScale) { + d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; + } else { // d_bias != nullptr + d_scale_or_d_bias_partial += d_y[i]; + } + + if (HasDx) { + if (scale != nullptr) { + d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; + } else { + d_x[i] = d_y[i] / var_val; + } + } + } + + d_scale_or_d_bias_partial = + BlockReduce(temp_storage).Reduce(d_scale_or_d_bias_partial, cub::Sum()); + + if (threadIdx.x == 0) { + if (HasDScale) { + d_scale[blockIdx.x] = d_scale_or_d_bias_partial; + } else { + d_bias[blockIdx.x] = d_scale_or_d_bias_partial; + } + } +} + +template +__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x, + const T *mean, + const T *var, + float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T d_x_reduce_tmp[2]; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + T block_mean = mean[blockIdx.x]; + T block_var = var[blockIdx.x]; + T d_x_mean_partial = 0, d_x_var_partial = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x_mean_partial += d_x[i]; + d_x_var_partial += d_x[i] * (x[i] - block_mean); + } + + auto pair = + BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_x_reduce_tmp[0] = pair.first_ / feature_size; + d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + } + __syncthreads(); + + d_x_mean_partial = d_x_reduce_tmp[0]; + d_x_var_partial = d_x_reduce_tmp[1]; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x[i] -= d_x_mean_partial; + d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + } +} + +// Here, we only calculate d_x +template +__global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y, + T *d_x, const T *mean, + const T *var, const T *scale, + float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T d_x_reduce_tmp[2]; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x]; + T d_x_mean_partial = 0, d_x_var_partial = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + auto var_val = static_cast(real_sqrt(block_var + epsilon)); + if (scale != nullptr) { + int col_idx = i % feature_size; + d_x[i] = d_y[i] * scale[col_idx] / var_val; + } else { + d_x[i] = d_y[i] / var_val; + } + d_x_mean_partial += d_x[i]; + d_x_var_partial += d_x[i] * (x[i] - block_mean); + } + + auto pair = + BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_x_reduce_tmp[0] = pair.first_ / feature_size; + d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + } + __syncthreads(); + + d_x_mean_partial = d_x_reduce_tmp[0]; + d_x_var_partial = d_x_reduce_tmp[1]; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x[i] -= d_x_mean_partial; + d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + } +} + +template +__global__ void LayerNormBackwardWhenBatchSizeIsOne( + const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean, + const T *var, const T *scale, float epsilon, int feature_size) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < feature_size) { + auto var_val = static_cast(real_sqrt(var[idx] + epsilon)); + if (d_x != nullptr) { + if (d_scale == nullptr) { + d_x[idx] = d_y[idx] / var_val; + } else { + d_x[idx] = d_y[idx] * scale[idx] / var_val; + } + } + + if (d_scale != nullptr) { + d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val; + } + + if (d_bias != nullptr) d_bias[idx] = d_y[idx]; + } +} + +template +static void LayerNormBackward(const T *x, const T *d_y, const T *scale, + const T *mean, const T *var, T *d_x, T *d_scale, + T *d_bias, float epsilon, int batch_size, + int feature_size, cudaStream_t stream) { + const int kMaxBlockDim = 512; + int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) | + ((d_scale != nullptr ? 1 : 0) << 1) | + ((d_bias != nullptr ? 1 : 0)); + if (gradient_flag == 0) return; + + if (batch_size == 1) { + LayerNormBackwardWhenBatchSizeIsOne< + T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, + stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, + feature_size); + + if (d_x != nullptr) { + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<<1, kBlockDim, 0, stream>>>( + x, d_x, mean, var, epsilon, feature_size)); + } + } + return; + } + + auto block_dim = GetDesiredBlockDim(batch_size); + switch (gradient_flag) { + case 1: // d_x == nulptr, d_scale == nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, false, + false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + break; + case 2: // d_x == nullptr, d_scale != nullptr, d_bias == nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, false, + true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + break; + case 3: // d_x == nullptr, d_scale != nulptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientAll< + T, kBlockDim, false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, + batch_size, feature_size)); + } + break; + case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientOnlyDX< + T, kBlockDim><<>>( + x, d_y, d_x, mean, var, scale, epsilon, feature_size)); + } + break; + case 5: // d_x != nulptr, d_scale == nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, true, + false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, true, + true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientAll< + T, kBlockDim, true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, + batch_size, feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + default: + break; + } +} + +template +class LayerNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto *x = ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x->dims(); + auto *x_data = x->data(); + auto *y_data = y->mutable_data(ctx.GetPlace()); + auto *mean_data = mean->mutable_data(ctx.GetPlace()); + auto *var_data = var->mutable_data(ctx.GetPlace()); + auto *scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *bias_data = (bias == nullptr ? nullptr : bias->data()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + + auto stream = ctx.cuda_device_context().stream(); + + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForward<<>>( + x_data, scale_data, bias_data, y_data, mean_data, var_data, + epsilon, feature_size)); + default: + PADDLE_THROW( + "Product from begin_norm_axis to end must be larger than 1"); + break; + } + } +}; + +template +class LayerNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + // d_x, d_scale, d_bias may be nullptr + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + auto *x = ctx.Input("X"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *d_y = ctx.Input(framework::GradVarName("Y")); + + auto *x_data = x->data(); + auto *d_y_data = d_y->data(); + auto *mean_data = mean->data(); + auto *var_data = var->data(); + auto *scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *d_scale_data = + (d_scale == nullptr ? nullptr + : d_scale->mutable_data(ctx.GetPlace())); + auto *d_bias_data = + (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + auto *d_x_data = + (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); + + const auto &x_dims = x->dims(); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + + auto stream = ctx.cuda_device_context().stream(); + + LayerNormBackward(x_data, d_y_data, scale_data, mean_data, var_data, + d_x_data, d_scale_data, d_bias_data, epsilon, + batch_size, feature_size, stream); + } +}; + +#undef FIXED_BLOCK_DIM_CASE_BASE +#undef FIXED_BLOCK_DIM_CASE +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 2199f5311fd3728e624fc222a1b876eb947cc0aa..be623703c2480774bb04a6bc0c5b00b699d7bb16 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) { std::string ser(seriralized); self.SetAttr(name, ser); }) - .def("block_attr", &pd::OpDesc::GetBlockAttr) + .def("block_attr_id", &pd::OpDesc::GetBlockAttrId) + .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("check_attrs", &pd::OpDesc::CheckAttrs) .def("infer_shape", &pd::OpDesc::InferShape) .def("infer_var_type", &pd::OpDesc::InferVarType) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2320f3e4dbc5d98698670fee19ed983411a802a9..7127bb38f6ddf8a55c1741d1f0ef18c8d9067fba 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -664,7 +664,7 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("bcast_params", &ParallelExecutor::BCastParamsToDevices) + .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index cccb4abe6cf14ac3e90ba59d970c9398b09b7db2..1ae05dec8de6b9cbc9568f0f4d437833be520f8d 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -48,8 +48,6 @@ from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from .transpiler import DistributeTranspiler, InferenceTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig -from .concurrency import (Go, make_channel, channel_send, channel_recv, - channel_close, Select) from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from . import clip from . import profiler @@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor -__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ +__all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ parallel_executor.__all__ + lod_tensor.__all__ + [ 'io', diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 6b739745119836a80518d7bc6138b37101e74431..fd6a76dd0cfa347328d87093884e5cd324395497 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -344,7 +344,7 @@ def _append_backward_ops_(block, grad_sub_block_list = [] # If the op has its own sub-block, deal with the sub-block first if op.has_attr("sub_block"): - sub_block = program.block(op.block_attr("sub_block")) + sub_block = program.block(op.block_attr_id("sub_block")) grad_sub_block = program.create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) cb = _callback_lookup_(op) @@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) if op_desc.has_attr("sub_block"): - sub_block = block.program.block(op_desc.block_attr("sub_block")) + sub_block = block.program.block(op_desc.block_attr_id("sub_block")) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) new_vars = set() # create new gradient variables diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index a8c4d66720d6eda857e5960d86fc3b8ec8f11ade..676a52a917dd1f9700ec38de32932938ec339be5 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -19,8 +19,7 @@ from .layers import fill_constant from . import core __all__ = [ - 'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', - 'Select' + 'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select' ] @@ -35,10 +34,10 @@ class Go(BlockGuard): def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: return False - self.construct_go_op() + self._construct_go_op() return super(Go, self).__exit__(exc_type, exc_val, exc_tb) - def construct_go_op(self): + def _construct_go_op(self): main_program = self.helper.main_program go_block = main_program.current_block() parent_block = main_program.block(main_program.current_block() diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 35da1d06a2c1da8ba663ea0f0b9a0e58ea7c4470..e24b9faae24084ccc743a5b5126db9667089e128 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -18,9 +18,7 @@ import six from .framework import Program, default_main_program, Variable from . import core -__all__ = [ - 'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var' -] +__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope'] g_scope = core.Scope() @@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): return fetch_count > 0 -def fetch_var(name, scope=None, return_numpy=True): +def _fetch_var(name, scope=None, return_numpy=True): """ Fetch the value of the variable with the given name from the given scope. @@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list): return str(feed_var_names + fetch_var_names) +def _as_lodtensor(data, place): + """ + Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + + Examples: + >>> import paddle.fluid as fluid + >>> place = fluid.CPUPlace() + >>> exe = fluid.executor(place) + >>> data = np.array(size=(100, 200, 300)) + >>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data) + >>> ... + + Args: + data(numpy.ndarray): a instance of array + + Returns: + LoDTensor + """ + if isinstance(data, list): + raise RuntimeError("Some of your feed data hold LoD information. \ + They can not be completely cast from a list of Python \ + ndarray to LoDTensor. Please convert data to LoDTensor \ + directly before feeding the data.\ + ") + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, place) + return tensor + + class Executor(object): """ An Executor in Python, only support the single-GPU running. For multi-cards, please refer to @@ -250,35 +279,6 @@ class Executor(object): self.program_caches = dict() self._closed = False - def as_lodtensor(self, data): - """ - Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. - For higher dimensional sequence data, please use LoDTensor directly. - - Examples: - >>> import paddle.fluid as fluid - >>> exe = fluid.executor(fluid.CPUPlace()) - >>> data = np.array(size=(100, 200, 300)) - >>> np_outs = map(lambda x: exe.as_lodtensor(x), data) - >>> ... - - Args: - data(numpy.ndarray): a instance of array - - Returns: - LoDTensor - """ - if isinstance(data, list): - raise RuntimeError("Some of your feed data hold LoD information. \ - They can not be completely cast from a list of Python \ - ndarray to LoDTensor. Please convert data to LoDTensor \ - directly before feeding the data.\ - ") - # single tensor case - tensor = core.LoDTensor() - tensor.set(data, self.place) - return tensor - def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) @@ -337,7 +337,7 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.as_lodtensor(cur_feed) + cur_feed = _as_lodtensor(cur_feed, self.place) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3d7c29c6ea33ebc49b071681c5f9746b97445aa3..45b3abb88c9431f52705bb62df2c32779dd0cf9d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -476,23 +476,25 @@ class Operator(object): attrs=None): self.block = block self.desc = desc - self.attrs = attrs - if self.attrs is None: - self.attrs = dict() + # note: not add self.attrs here: + # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173 + op_attrs = attrs + if op_attrs is None: + op_attrs = dict() del attrs op_maker = core.op_proto_and_checker_maker - if op_maker.kOpRoleAttrName() not in self.attrs: - self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role + if op_maker.kOpRoleAttrName() not in op_attrs: + op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role role_var_name = op_maker.kOpRoleVarAttrName() if len(self.block.program. - op_role_var) != 0 and role_var_name not in self.attrs: - self.attrs[role_var_name] = self.block.program.op_role_var + op_role_var) != 0 and role_var_name not in op_attrs: + op_attrs[role_var_name] = self.block.program.op_role_var - if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0: - del self.attrs[role_var_name] + if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: + del op_attrs[role_var_name] if len(self.desc.type()) != 0: return @@ -576,15 +578,14 @@ class Operator(object): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - if self.attrs is not None: - if not isinstance(self.attrs, dict): + if op_attrs is not None: + if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") for attr in proto.attrs: attr_name = attr.name - if (attr_name not in self.attrs) or ( - self.attrs[attr_name] is None): + if (attr_name not in op_attrs) or (op_attrs[attr_name] is None): continue - attr_val = self.attrs[attr_name] + attr_val = op_attrs[attr_name] self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() @@ -732,7 +733,6 @@ class Operator(object): Raises: ValueError: If the type of value doesn't match with desc.attr_type(name). """ - self.attrs[name] = val self._update_desc_attr(name, val) def _update_desc_attr(self, name, val): @@ -774,9 +774,9 @@ class Operator(object): """ return self.desc.attr(name) - def block_attr(self, name): + def block_attr_id(self, name): """ - Get the block attribute by name. + Get the block attribute's id by name. Args: name(str): the attribute name. @@ -784,22 +784,74 @@ class Operator(object): Returns: int: the block index. """ - return self.desc.block_attr(name) + return self.desc.block_attr_id(name) + + def block_attr(self, name): + """ + Get the block attribute by name. + + Args: + name(str): the attribute name. + + Returns: + block: the block attribute. + """ + + id = self.block_attr_id(name) + assert (id >= 0 and id < len(self.block.program.blocks)) + return self.block.program.blocks[id] + + def blocks_attr(self, name): + """ + Get the blocks attribute by name. + + Args: + name(str): the attribute name. + + Returns: + list: list of the blocks attribute. + """ + attrs = [] + for i in self.blocks_attr_ids(name): + assert (i >= 0 and i < len(self.block.program.blocks)) + attrs.append(self.block.program.blocks[i]) + + return attrs + + def blocks_attr_ids(self, name): + """ + Get the blocks attribute's ids by name. + + Args: + name(str): the attribute name. + + Returns: + list: list of the blocks ids. + """ + + return self.desc.blocks_attr_ids(name) def all_attrs(self): """ Get the attribute dict. Returns: - dict: The Operator's attribute dict. + dict: The Operator's attribute dict, name->attr. """ attr_names = self.attr_names attr_map = {} for n in attr_names: - if n == 'sub_block': + attr_type = self.desc.attr_type(n) + if attr_type == core.AttrType.BLOCK: attr_map[n] = self.block_attr(n) - else: - attr_map[n] = self.attr(n) + continue + + if attr_type == core.AttrType.BLOCKS: + attr_map[n] = self.blocks_attr(n) + continue + + attr_map[n] = self.attr(n) + return attr_map @@ -1518,11 +1570,17 @@ class Program(object): The two code snippets above will generate same programs. """ if for_test: - p = self.inference_optimize() + p = self.inference_optimize(export_for_deployment=False) else: p = Program() + p.current_block_idx = self.current_block_idx + p._seed = self._seed p.desc = core.ProgramDesc(self.desc) - p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())] + p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] + + p._current_role = self._current_role + p._op_role_var = self._op_role_var + p._sync_with_cpp() p._copy_param_info_from(self) @@ -1578,7 +1636,7 @@ class Program(object): res._sync_with_cpp() return res - def inference_optimize(self): + def inference_optimize(self, export_for_deployment=True): """ This method will create a new program and do following adjustments on it: 1. Remove all reader variables and their creator ops if exist. @@ -1589,6 +1647,10 @@ class Program(object): attribute of operators to :code:`True`. All the :code:`Parameter` information will be lost. + Args: + export_for_deployment(bool): remove the read ops that are added by py_reader + for cpp inference library + Notes: This API is a very low level API. Use :code:`Program.clone(for_test=True)` instead. @@ -1603,16 +1665,17 @@ class Program(object): # remove all readers and the read_op if exist read_op_idx = 0 root_block = res.desc.block(0) - while True: - if read_op_idx >= root_block.op_size() or root_block.op( - read_op_idx).type() == 'read': - break - read_op_idx += 1 - if read_op_idx < root_block.op_size(): - root_block._remove_op(0, read_op_idx + 1) - for var in root_block.all_vars(): - if var.type() == core.VarDesc.VarType.READER: - root_block._remove_var(var.name()) + if export_for_deployment: + while True: + if read_op_idx >= root_block.op_size() or root_block.op( + read_op_idx).type() == 'read': + break + read_op_idx += 1 + if read_op_idx < root_block.op_size(): + root_block._remove_op(0, read_op_idx + 1) + for var in root_block.all_vars(): + if var.type() == core.VarDesc.VarType.READER: + root_block._remove_var(var.name()) # change all `is_test` attributes to True for i in range(res.desc.num_blocks()): diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 83290ac60839b855a0348696ad6898af7335e2fc..3f740dd7c539973c5bb2b260120729e9563eedd2 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -264,7 +264,8 @@ class NormalInitializer(Initializer): "dtype": int(var.dtype), "mean": self._mean, "std": self._std_dev, - "seed": self._seed + "seed": self._seed, + "use_mkldnn": False }) var.op = op return op diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 55e517f1f4a48beddf62fe875c2064da09b2f6b0..af734210323913a36f861380dc38a98253aca0a1 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -555,7 +555,8 @@ def save_inference_model(dirname, executor, main_program=None, model_filename=None, - params_filename=None): + params_filename=None, + export_for_deployment=True): """ Prune the given `main_program` to build a new program especially for inference, and then save it and all related parameters to given `dirname` by the `executor`. @@ -577,6 +578,8 @@ def save_inference_model(dirname, params_filename(str|None): The name of file to save all related parameters. If it is setted None, parameters will be saved in separate files . + export_for_deployment(bool): remove the read ops that are added by py_reader + for cpp inference lib. Default True Returns: None @@ -643,7 +646,8 @@ def save_inference_model(dirname, copy_program.desc.flush() pruned_program = copy_program.prune(targets=target_vars) - inference_program = pruned_program.inference_optimize() + inference_program = pruned_program.inference_optimize( + export_for_deployment=export_for_deployment) fetch_var_names = [v.name for v in target_vars] prepend_feed_ops(inference_program, feeded_var_names) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index fe1155b2930d3ce5a062a2d5a92e356e73686c8a..0800c02d9ec7dc51d4f22d3b04f3ae90315ebad8 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -164,7 +164,7 @@ def rpn_target_assign(loc, }) # 4. Reshape and gather the target entry - scores = nn.reshape(x=scores, shape=(-1, 1)) + scores = nn.reshape(x=scores, shape=(-1, 2)) loc = nn.reshape(x=loc, shape=(-1, 4)) target_label = nn.reshape(x=target_label, shape=(-1, 1)) target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4)) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index eabe6bb901ad1b7afe71717eb3f5f5765f261201..2a3555ebdde4d54f63bb420218896560c1b40ffd 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -273,19 +273,19 @@ class ParallelExecutor(object): arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() if self.is_dist: - self.bcast_params() + self._bcast_params() if return_numpy: return executor.as_numpy(arr) return [arr[i] for i in range(len(arr))] - def bcast_params(self): + def _bcast_params(self): """ Broadcast the parameters to other devices. It is used during distributed training. """ - self.executor.bcast_params(set(self.persistable_vars)) + self.executor._bcast_params(set(self.persistable_vars)) @property def device_count(self): diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py new file mode 100644 index 0000000000000000000000000000000000000000..8603d3a5b3b5d368fe87b8dcf9dc7363f95caf86 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -0,0 +1,196 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +import collections + +SEED = 1 +DTYPE = "float32" +paddle.dataset.mnist.fetch() + + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(batch_size): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def operator_equal(a, b): + for k, v in a.__dict__.iteritems(): + if isinstance(v, fluid.framework.Program) or \ + isinstance(v, fluid.framework.Block): + continue + + elif isinstance(v, core.OpDesc): + if v.serialize_to_string() != b.__dict__[k].serialize_to_string(): + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + elif isinstance(v, collections.OrderedDict): + v0 = sorted(v.iteritems(), key=lambda x: x[0]) + v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + + if v0 != v1: + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + elif (v != b.__dict__[k]): + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + return True + + +def block_equal(a, b): + for k, v in a.__dict__.iteritems(): + if isinstance(v, core.ProgramDesc) or isinstance( + v, fluid.framework.Program) or isinstance(v, core.BlockDesc): + continue + + elif k == "ops": + for i in range(0, len(a.ops)): + if not operator_equal(a.ops[i], b.ops[i]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) + assert (len(a.ops) == len(b.ops)) + + elif isinstance(v, collections.OrderedDict): + v0 = sorted(v.iteritems(), key=lambda x: x[0]) + v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + + if v0 != v1: + raise ValueError("In block_equal not equal:{0}\n".format(k)) + + elif (v != b.__dict__[k]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) + + return True + + +def program_equal(a, b): + for k, v in a.__dict__.iteritems(): + if isinstance(v, core.ProgramDesc): + continue + + elif k == 'blocks': + for i in range(0, len(a.blocks)): + if not block_equal(a.blocks[i], b.blocks[i]): + raise ValueError("In operator_equal not equal:{0}\n".format( + k)) + return False + assert (len(a.blocks) == len(b.blocks)) + + elif (v != b.__dict__[k]): + raise ValueError("In program_equal not equal:{0}\n".format(k)) + + return True + + +class TestDistMnist(unittest.TestCase): + def test_desc_clone(self): + get_model(batch_size=20) + + pserver_endpoints = "127.0.0.1:9123" + trainers = 1 + current_endpoint = "127.0.0.1:9123" + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + main = pserver_prog.clone() + startup = startup_prog.clone() + + self.assertTrue(program_equal(main, pserver_prog)) + self.assertTrue(program_equal(startup, startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 1deccbe4af542cce32f0cbb4299cbe73d4df1111..4379463aca4443eb7a886ce78446440cc59f3b30 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase): self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._python_interp = "python" - def start_pserver(self, model_file): + def start_pserver(self, model_file, check_error_log): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, @@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase): (self._python_interp, model_file, self._ps_endpoints, ps1_ep, self._trainers) + ps0_pipe = subprocess.PIPE + ps1_pipe = subprocess.PIPE + if check_error_log: + print("ps0_cmd:", ps0_cmd) + print("ps1_cmd:", ps1_cmd) + ps0_pipe = open("/tmp/ps0_err.log", "wb") + ps1_pipe = open("/tmp/ps1_err.log", "wb") + ps0_proc = subprocess.Popen( - ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) ps1_proc = subprocess.Popen( - ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return ps0_proc, ps1_proc + ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + + if not check_error_log: + return ps0_proc, ps1_proc, None, None + else: + return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe def _wait_ps_ready(self, pid): retry_times = 50 @@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase): (e, retry_times)) retry_times -= 1 - def check_with_place(self, model_file, delta=1e-3): + def check_with_place(self, model_file, delta=1e-3, check_error_log=False): # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN required_envs = { "PATH": os.getenv("PATH"), @@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase): "FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_cudnn_deterministic": "1" } + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + # Run local to get a base line env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local.update(required_envs) local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ (self._python_interp, model_file, "127.0.0.1:1234", "127.0.0.1:1234", 1) - local_proc = subprocess.Popen( - local_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env_local) + if not check_error_log: + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env_local) + else: + print("trainer cmd:", local_cmd) + err_log = open("/tmp/trainer.err.log", "wb") + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=err_log, + env=env_local) + local_proc.wait() out, err = local_proc.communicate() local_ret = out @@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase): sys.stderr.write('local_stderr: %s\n' % err) # Run dist train to compare with local results - ps0, ps1 = self.start_pserver(model_file) + ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, + check_error_log) self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps1.pid) @@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase): env1.update(required_envs) FNULL = open(os.devnull, 'w') + tr0_pipe = subprocess.PIPE + tr1_pipe = subprocess.PIPE + if check_error_log: + print("tr0_cmd:", tr0_cmd) + print("tr1_cmd:", tr1_cmd) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") + tr0_proc = subprocess.Popen( tr0_cmd.split(" "), stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=tr0_pipe, env=env0) tr1_proc = subprocess.Popen( tr1_cmd.split(" "), stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=tr1_pipe, env=env1) tr0_proc.wait() @@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase): local_first_loss = eval(local_lines[0])[0] local_last_loss = eval(local_lines[1])[0] + # close trainer file + if check_error_log: + tr0_pipe.close() + tr1_pipe.close() + + ps0_pipe.close() + ps1_pipe.close() # FIXME: use terminate() instead of sigkill. os.kill(ps0.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3bc5e6ada5dabfcad38278db6e14d0b115c5050d..0543e6238109bf5bab68a9f08fc34678936e756f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -259,7 +259,7 @@ class TestLRDecayConditional(TranspilerTest): serv_op = pserver.blocks[0].ops[0] sub_blocks = [] optimize_blocks = [] - for b in serv_op.attrs["optimize_blocks"]: + for b in serv_op.all_attrs()["optimize_blocks"]: optimize_blocks.append(b.idx) for b in pserver.blocks: if b.idx not in optimize_blocks: @@ -536,5 +536,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +class TestRMSPropOptimizer(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer.minimize(avg_cost) + return + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + pserver2, startup2 = self.get_pserver(self.pserver2_ep) + + self.assertEqual(len(pserver.blocks), 3) + # block1~2: optimize pass + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "rmsprop"]) + # the variable #fc_w will be split into two blocks + fc_w_var = startup.global_block().var("fc_w.block1") + self.assertEqual(fc_w_var.shape, (500, 1000)) + moment_var = startup.global_block().var("momentum_1") + self.assertEqual(moment_var.shape, (500, 1000)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py index 46c3bbb6712c6276e48dd9328d7741a447f28b91..e6f37f0b4ca781e4ec83a00f8f2605ef02716bd7 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_var.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py @@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest): layers.assign(input=val, output=x) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_main_program(), feed={}, fetch_list=[]) - fetched_x = fluid.fetch_var("x") + fetched_x = fluid.executor._fetch_var("x") self.assertTrue( numpy.array_equal(fetched_x, val), "fetch_x=%s val=%s" % (fetched_x, val)) diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index c51a48239330621d8e008415f81361616467cabf..0997afc97a97333c914a3027103ec48733b410dc 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -17,6 +17,7 @@ import unittest from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name import paddle.fluid.layers as layers +import paddle.fluid as fluid main_program = default_main_program() @@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase): new_program = main_program.clone() self.assertNotEqual(0, len(new_program.blocks[0].all_parameters())) + def test_program_inference_optimize(self): + def net(): + reader = fluid.layers.py_reader( + capacity=10, + shapes=[[-1, 10], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + use_double_buffer=True) + in_data, label = fluid.layers.read_file(reader) + predict_label = fluid.layers.fc(in_data, size=2, act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict_label, label=label)) + + optimizer = fluid.optimizer.Adam() + optimizer.minimize(loss) + + startup_program = fluid.Program() + main_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + net() + no_read_program = main_program.inference_optimize() + keep_read_program = main_program.inference_optimize( + export_for_deployment=False) + no_read_ops = no_read_program.global_block().ops + keep_read_ops = keep_read_program.global_block().ops + self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2) + self.assertEqual(keep_read_ops[0].type, 'create_double_buffer_reader') + self.assertEqual(keep_read_ops[1].type, 'read') + + for i in range(len(no_read_ops)): + self.assertEqual(no_read_ops[i].type, keep_read_ops[i + 2].type) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index 621dd681345d2c185c0644de917b8bcd529f9937..9853fb4e9a89944bfdf2954e3d3d86fef92ac93c 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase): self.assertEqual(8, len(op.attr_names())) op.set_block_attr("block_attr", program_desc.block(0)) - self.assertEqual(0, op.block_attr("block_attr")) + self.assertEqual(0, op.block_attr_id("block_attr")) mul_op = block.append_op() mul_op.set_type("mul") diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py index 91b1fd2af7d8aaf85d17965f8b02c35ee3990291..f9bda5e4701f693f41fe7041ba0f5ec80b6fc31c 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py @@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase): next_data = np.random.uniform( low=0, high=1000, size=(batch_size, ) + shape[1:]).astype(dtype) - in_data.append(executor.as_lodtensor(next_data)) + in_data.append( + fluid.executor._as_lodtensor(next_data, place)) self.inputs.append(in_data) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 17ab875f6a555c99bbe480a54ea2df3b1f60de2d..868a0248be6833d0e8fed8a26549352562c279c1 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -22,7 +22,7 @@ pos_enc_param_names = ( "src_pos_enc_table", "trg_pos_enc_table", ) -batch_size = 64 +batch_size = 2 def position_encoding_init(n_position, d_pos_vec): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 63422a6ec5642fbf6568371b846f3aa022a7afc6..ff6e71bbfaf5aeb91150a8d4a48fa64be4421373 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -584,12 +584,12 @@ class DistributeTranspiler(object): if op.type in [ "gaussian_random", "fill_constant", "uniform_random" ]: - op.attrs["shape"] = new_outputs["Out"].shape + op.set_attr("shape", list(new_outputs["Out"].shape)) s_prog.global_block().append_op( type=op.type, inputs=new_inputs, outputs=new_outputs, - attrs=op.attrs) + attrs=op.all_attrs()) return s_prog # ====================== private transpiler functions ===================== @@ -603,7 +603,7 @@ class DistributeTranspiler(object): self.table_name = None for op in self.origin_program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: - if op.attrs['is_distributed'] is True: + if op.attr('is_distributed') is True: if self.table_name is None: self.table_name = op.input("W")[0] if self.table_name != op.input("W")[0]: @@ -749,14 +749,14 @@ class DistributeTranspiler(object): out_name = op.output("Out") ids_var = program.global_block().vars[ids_name[0]] - prefetch_input_vars = self.create_splited_vars( + prefetch_input_vars = self._create_splited_vars( source_var=ids_var, block=program.global_block(), tag="_prefetch_in_") self.all_prefetch_input_vars.append(prefetch_input_vars) out_var = program.global_block().vars[out_name[0]] - prefetch_output_vars = self.create_splited_vars( + prefetch_output_vars = self._create_splited_vars( source_var=out_var, block=program.global_block(), tag="_prefetch_out_") @@ -1038,7 +1038,7 @@ class DistributeTranspiler(object): program.global_block()._sync_with_cpp() return var_mapping - def create_splited_vars(self, source_var, block, tag): + def _create_splited_vars(self, source_var, block, tag): return [ block.create_var( name=str(source_var.name + tag + str(index)), @@ -1182,18 +1182,39 @@ class DistributeTranspiler(object): program = optimize_block.program pserver_block = program.global_block() new_inputs = dict() + # update param/grad shape first, then other inputs like # moment can use the updated shape + def _get_param_block(opt_op): + # param is already created on global program + param_block = None + for p in self.param_grad_ep_mapping[endpoint]["params"]: + if same_or_split_var(p.name, opt_op.input("Param")[0]): + param_block = p + break + return param_block + for key in opt_op.input_names: if key == "Grad": new_inputs[key] = merged_var + # For RMSProp optimizer + elif key == "Moment" or key == "MeanSquare": + param_block = _get_param_block(opt_op) + if not param_block: + return + moment_var = origin_program.global_block().vars[opt_op.input( + key)[0]] + tmpvar = pserver_block.create_var( + name=moment_var.name, + persistable=moment_var.persistable, + dtype=moment_var.dtype, + # change to use same shape as param + # TODO(typhoonzero): didn't append .block in the var name, + # may affect checkpoint saving? Need to verify. + shape=param_block.shape) + new_inputs[key] = tmpvar elif key == "Param": - # param is already created on global program - param_block = None - for p in self.param_grad_ep_mapping[endpoint]["params"]: - if same_or_split_var(p.name, opt_op.input(key)[0]): - param_block = p - break + param_block = _get_param_block(opt_op) if not param_block: return tmpvar = pserver_block.create_var( @@ -1219,7 +1240,7 @@ class DistributeTranspiler(object): for key in opt_op.input_names: new_shape = None - if key in ["Param", "Grad", "LearningRate"]: + if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]: continue var = self.origin_program.global_block().vars[opt_op.input(key)[0]] # update accumulator variable shape @@ -1242,7 +1263,7 @@ class DistributeTranspiler(object): type=opt_op.type, inputs=new_inputs, outputs=outputs, - attrs=opt_op.attrs) + attrs=opt_op.all_attrs()) def _is_splited_grad_var(self, var, var_dict): grad_block = None @@ -1275,7 +1296,7 @@ class DistributeTranspiler(object): block._clone_variable(var) return block.append_op( - type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) + type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs()) def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program @@ -1316,7 +1337,7 @@ class DistributeTranspiler(object): type=opt_op.type, inputs=inputs, outputs=outputs, - attrs=opt_op.attrs) + attrs=opt_op.all_attrs()) def _is_op_connected(self, op1, op2): # If one op's input is another op's output or @@ -1421,8 +1442,8 @@ class DistributeTranspiler(object): # optimize op_maker = core.op_proto_and_checker_maker optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize - if op_maker.kOpRoleAttrName() in op.attrs and \ - int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): + if op_maker.kOpRoleAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): return True return False @@ -1445,8 +1466,8 @@ class DistributeTranspiler(object): # and op_role_var to get the pair. for input_name in op.input_arg_names: if input_name.find("@GRAD") != -1 and \ - op.attrs[RPC_OP_ROLE_ATTR_NAME]: - param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0] + op.attr(RPC_OP_ROLE_ATTR_NAME): + param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0] params_grads.append([ origin_var_dict[param_name], origin_var_dict[input_name] diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 142fa5c31d2c558e482001da73fa26d8396c3967..87f20bbccf3138585841952efacef5b0a3cbbace 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,10 +57,10 @@ class InferenceTranspiler(object): scope = global_scope() if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") - self.fuse_batch_norm(program, place, scope) - self.fuse_relu_mkldnn(program) + self._fuse_batch_norm(program, place, scope) + self._fuse_relu_mkldnn(program) - def fuse_relu_mkldnn(self, program): + def _fuse_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. @@ -104,7 +104,7 @@ class InferenceTranspiler(object): # And a better solution will be considered later. program = program.clone() - def fuse_batch_norm(self, program, place, scope): + def _fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. diff --git a/python/requirements.txt b/python/requirements.txt index c091ecb111bda9d5e83c3ddcae93aed0745f9e4c..f8298a63612cb217ce0e711e78fffdf86b73313d 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ requests==2.9.2 -numpy>=1.12 +numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version protobuf==3.1 recordio>=0.1.0 matplotlib