diff --git a/CMakeLists.txt b/CMakeLists.txt index f56c5d382af8cdfb5a941ee272a0f8d22ec04d67..920c20d6f813c14df8e02593b1c4a5a13cc11ef0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,12 +204,11 @@ include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) +set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE) if(WITH_GPU) include(cuda) include(tensorrt) include(external/anakin) -else() - set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) endif() include(cudnn) # set cudnn libraries, must before configure diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e4af34d10ed92c501dd805addb62747c91c00978..c35096e09b5685ee30f20648e7dd461f71e0b1c4 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -97,6 +97,14 @@ if(WITH_GPU) endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() + if(WITH_ANAKIN) + if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) + message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile") + endif() + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile") + endif() + endif() elseif(WITH_AMD_GPU) add_definitions(-DPADDLE_WITH_HIP) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 8b7d91f234594becdda805c089fac0bb4e4e8e44..403873a51017b9bb7c888bed77d89ca9b15b68d2 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN) return() endif() -set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH - "Anakin install path." FORCE) -set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") -set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") +INCLUDE(ExternalProject) +set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin) +# the anakin install dir is only default one now +set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output) +set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR}) +set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR}) +set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so) +set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so) + +# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later. +set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin") +execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}") +execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}") + +include_directories(${ANAKIN_INCLUDE}) +include_directories(${ANAKIN_INCLUDE}/saber/) set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable @@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-reorder -Wno-error=cpp) -set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz") - -# A helper function used in Anakin, currently, to use it, one need to recursively include -# nearly all the header files. -function(fetch_include_recursively root_dir) - if (IS_DIRECTORY ${root_dir}) - include_directories(${root_dir}) - endif() - - file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) - foreach(sub ${ALL_SUB}) - if (IS_DIRECTORY ${root_dir}/${sub}) - fetch_include_recursively(${root_dir}/${sub}) - endif() - endforeach() -endfunction() - -if (NOT EXISTS "${ANAKIN_INSTALL_DIR}") - # download library - message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") - execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}") - execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz") -endif() +ExternalProject_Add( + extern_anakin + ${EXTERNAL_PROJECT_LOG_ARGS} + # TODO(luotao): use PaddlePaddle/Anakin later + GIT_REPOSITORY "https://github.com/luotao1/Anakin" + GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be" + PREFIX ${ANAKIN_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DUSE_GPU_PLACE=YES + -DUSE_X86_PLACE=YES + -DBUILD_WITH_UNIT_TEST=NO + -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf + -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml + -DCUDNN_ROOT=${CUDNN_ROOT} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} +) -if (WITH_ANAKIN) - message(STATUS "Anakin for inference is enabled") - message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - fetch_include_recursively(${ANAKIN_INCLUDE}) - link_directories(${ANAKIN_LIBRARY}) -endif() +message(STATUS "Anakin for inference is enabled") +message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") + +add_library(anakin_shared SHARED IMPORTED GLOBAL) +set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) +add_dependencies(anakin_shared extern_anakin protobuf mklml) + +add_library(anakin_saber SHARED IMPORTED GLOBAL) +set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) +add_dependencies(anakin_saber extern_anakin protobuf mklml) + +list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index aeb081e76e5bc5b9d3d81ce625195c800174ab6c..834ab5a9e527355d3664313d38cd4920f6fbf535 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU) copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api SRCS ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api - ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release + ${ANAKIN_INSTALL_DIR} # anakin release DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin) list(APPEND inference_deps anakin_inference_lib) endif() diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst index db2842e7f23e74130a966bb347004bee1ccb08fd..f23ecc1f80030f20359ce9675130a167722606c9 100644 --- a/doc/fluid/api/executor.rst +++ b/doc/fluid/api/executor.rst @@ -38,11 +38,3 @@ _switch_scope .. autofunction:: paddle.fluid.executor._switch_scope :noindex: -.. _api_fluid_executor_fetch_var: - -fetch_var ---------- - -.. autofunction:: paddle.fluid.executor.fetch_var - :noindex: - diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst index 51cdfe0c2ed045a5b3247c4fdec9868d756eae86..7eab58355c3648d929d3b5d98984adce9034f016 100644 --- a/doc/fluid/api/fluid.rst +++ b/doc/fluid/api/fluid.rst @@ -106,22 +106,6 @@ _switch_scope .. autofunction:: paddle.fluid._switch_scope :noindex: -.. _api_fluid_fetch_var: - -fetch_var ---------- - -.. autofunction:: paddle.fluid.fetch_var - :noindex: - -.. _api_fluid_Go: - -Go --- - -.. autoclass:: paddle.fluid.Go - :members: - :noindex: .. _api_fluid_make_channel: diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 21a82aaa4ad884a4fe7b992781027ccd204b11ca..964be52ee9a9a30a427eab6951c8395d247153df 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) @@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) @@ -34,21 +37,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True)) -paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None) -paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -62,20 +54,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', ' paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.InferenceTranspiler.__init__ -paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None) -paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0)) -paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None @@ -89,8 +77,8 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) @@ -338,14 +326,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) paddle.fluid.transpiler.InferenceTranspiler.__init__ -paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 6e302a29233b96451df14b4685911be1cd87c1ab..c97b364de1ecae21e97351196389615187932b5e 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,7 @@ #pragma once +#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -22,27 +23,24 @@ namespace details { class ExceptionHolder { public: - void Catch(const platform::EnforceNotMet& exp) { - std::lock_guard lock(mu_); - exception_.reset(new platform::EnforceNotMet(exp)); - type_ = kEnforceNotMet; - } - - void Catch(const platform::EOFException& exp) { - std::lock_guard lock(mu_); - // EOFException will not cover up existing EnforceNotMet. - if (exception_.get() == nullptr) { - exception_.reset(new platform::EOFException(exp)); - type_ = kEOF; + void Catch(std::exception_ptr eptr) { + try { + std::rethrow_exception(eptr); + } catch (platform::EOFException exp) { + Catch(exp); + } catch (platform::EnforceNotMet exp) { + Catch(exp); + } catch (...) { + LOG(FATAL) << "Unknown exception caught"; } } - bool ExceptionCatched() const { + bool IsCaught() const { std::lock_guard lock(mu_); return exception_.get() != nullptr; } - void Throw() { + void ReThrow() { std::lock_guard lock(mu_); switch (type_) { case kNone: @@ -50,27 +48,41 @@ class ExceptionHolder { case kEnforceNotMet: { auto e = *static_cast(exception_.get()); throw e; - break; } case kEOF: { auto e = *static_cast(exception_.get()); throw e; - break; } - default: - LOG(FATAL) << "Unknown exception."; } - exception_.reset(); - type_ = kNone; + ClearImpl(); } void Clear() { std::lock_guard lock(mu_); + ClearImpl(); + } + + private: + void ClearImpl() { exception_.reset(); type_ = kNone; } - private: + void Catch(const platform::EnforceNotMet& exp) { + std::lock_guard lock(mu_); + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } + + void Catch(const platform::EOFException& exp) { + std::lock_guard lock(mu_); + // EOFException will not cover up existing EnforceNotMet. + if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(exp)); + type_ = kEOF; + } + } + enum ExceptionType { kNone, kEnforceNotMet, kEOF }; ExceptionType type_{kNone}; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 994bb6492f685138d02971a6caf12572aecd6d6f..c9e331ef359f853263f8dad38dd0a2be4d9618ad 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -107,11 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - if (exception_holder_.ExceptionCatched()) { + if (exception_holder_.IsCaught()) { for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - exception_holder_.Throw(); + exception_holder_.ReThrow(); } else { continue; } @@ -220,12 +220,8 @@ void ThreadedSSAGraphExecutor::RunOp( running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; - } catch (platform::EOFException ex) { - exception_holder_.Catch(ex); - } catch (platform::EnforceNotMet ex) { - exception_holder_.Catch(ex); } catch (...) { - LOG(FATAL) << "Unknown exception catched"; + exception_holder_.Catch(std::current_exception()); } }; if (pool_) { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index c9d55fbf525a1a476ac469e8e57462169a7db2da..5736a5c4e232698085936303d1f23760649f8245 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -28,6 +28,38 @@ namespace paddle { namespace framework { namespace ir { +/* + * The graph is a Directed Acyclic Single Static Assignment Graph. + * + * In more detail, the following properties must hold: + * + * The graph shouldn't contain cycle. Each node is a black-box to the graph + * so the node itself could be a loop operator. + * + * Each Variable-type node has only one input (thus single static assignment). + * + * The output/input of operator is variable and the output/input of variable + * is operator. + * + * The following data harzards in Program are addressed in the Graph: + * + * Write-After-Read + * a = op1(x) + * x = op2(b) + * A control-dependency connection is created bettwen op1 and op2 such that + * op1->op2, so as to ensure correct order. + * + * Write-After-Write + * x = op1(a) + * x = op2(b) + * A control-dependency connection is created between op1 and op2 such that + * op1->op2, so as to ensure correct order. + * + * Other properties currently hold, but is not enforced yet: + * + * Variable-type node (not control dep) with the same variable name share + * the same underlying VarDesc. + */ class Graph { public: explicit Graph(const ProgramDesc &program); diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index f9e6bdf3625bdced9d1a9195a979b0f46016d8bf..b1b8d1c586c98a327a8e5b4890ced00022155e6b 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -36,7 +36,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker { public: void Make() { AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); + AddOutput("Out", "").AsDuplicable(); AddComment(""); } }; @@ -59,11 +59,27 @@ class SumOpVarTypeInference : public VarTypeInference { block->Var(out_var_name)->SetType(default_var_type); } }; + +class DummyOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class DummyOpVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc &op_desc, BlockDesc *block) const override {} +}; } // namespace framework } // namespace paddle REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, paddle::framework::SumOpMaker); @@ -110,5 +126,83 @@ TEST(GraphTest, Basic) { } ASSERT_EQ(nodes.size(), 5); } + +TEST(GraphTest, WriteAfterRead) { + // void Test() { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + op = prog.MutableBlock(0)->AppendOp(); + op->SetType("dummy"); + op->SetInput("X", {"c"}); + op->SetOutput("Out", {"a"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + + std::unique_ptr g(new ir::Graph(prog)); + ir::Node *control_dep1 = nullptr; + ir::Node *control_dep2 = nullptr; + for (ir::Node *n : g->Nodes()) { + if (n->Name() == "sum") { + ASSERT_EQ(n->outputs[0]->Name(), "b"); + ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); + control_dep1 = n->outputs[1]; + ASSERT_EQ(n->outputs.size(), 2); + } + if (n->Name() == "dummy") { + ASSERT_EQ(n->inputs[0]->Name(), "c"); + ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); + control_dep2 = n->inputs[1]; + ASSERT_EQ(n->inputs.size(), 2); + } + } + ASSERT_EQ(control_dep1, control_dep2); +} + +TEST(GraphTest, WriteAfterWrite) { + // void Test() { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + op = prog.MutableBlock(0)->AppendOp(); + op->SetType("dummy"); + op->SetInput("X", {"c"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + + std::unique_ptr g(new ir::Graph(prog)); + ir::Node *control_dep1 = nullptr; + ir::Node *control_dep2 = nullptr; + for (ir::Node *n : g->Nodes()) { + if (n->Name() == "sum") { + ASSERT_EQ(n->outputs[0]->Name(), "b"); + ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); + ASSERT_EQ(n->outputs.size(), 2); + control_dep1 = n->outputs[1]; + } + if (n->Name() == "dummy") { + ASSERT_EQ(n->inputs[0]->Name(), "c"); + ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); + control_dep2 = n->inputs[1]; + ASSERT_EQ(n->inputs.size(), 2); + ASSERT_EQ(control_dep1, control_dep2); + } + } +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index a190199f1cb1361f67f20c755b8e7ef52c284adc..03f7e71c03b8dd75d2a47cb4c6d1ef1a71792cf3 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const { } } -int OpDesc::GetBlockAttr(const std::string &name) const { +std::vector OpDesc::GetBlocksAttrIds(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + auto blocks = boost::get>(it->second); + + std::vector ids; + for (auto n : blocks) { + ids.push_back(n->ID()); + } + + return ids; +} + +int OpDesc::GetBlockAttrId(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); return boost::get(it->second)->ID(); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 74dd8ec002005dd080424b48b5db1a2574a6974f..b77d84125a23b81c3de4123bea6f0e09cd6d1e90 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -83,7 +83,9 @@ class OpDesc { Attribute GetNullableAttr(const std::string &name) const; - int GetBlockAttr(const std::string &name) const; + int GetBlockAttrId(const std::string &name) const; + + std::vector GetBlocksAttrIds(const std::string &name) const; void Rename(const std::string &old_name, const std::string &new_name); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 1e01a6e900404990e16674755367d2fc6d832725..20bdc7830f32564448a69e9cd76c02585b7a1aca 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { int sub_block_id = - o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name); + o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); } } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index c7286dacf01659f3af0927a71856e5a6496cb877..56bb9142dabe0d5546e321e675a5acba7bf4d306 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -112,5 +112,6 @@ Tensor& Tensor::Resize(const DDim& dims) { const DDim& Tensor::dims() const { return dims_; } int64_t Tensor::numel() const { return product(dims_); } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 7f678f869aac4616c8bca440d0431f765da41dd6..b7b62eef23ec351686378c913d18fc72308fd7b2 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -59,6 +59,14 @@ inline T* Tensor::mutable_data(platform::Place place) { } inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { + int rank = src.dims().size(); + PADDLE_ENFORCE_GE( + rank, 2, + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. Can not be used in reshaping vectors."); + if (rank == 2) { + return src; + } Tensor res; res.ShareDataWith(src); res.Resize(flatten_to_2d(src.dims(), num_col_dims)); diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index c4ab26a2288bb9d8f3cd54a797d2062e0606b219..9318f1089781b30468cf4d3c7151d0dd26e50a9c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager { if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( - {"elementwise_add", "mul", "conv2d", "pool2d", "relu"}); + {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"}); if (!node->IsFunction()) return false; const auto* func = static_cast(node); - if (teller_set.count(func->func_type())) + if (teller_set.count(func->func_type())) { return true; - else { + } else { return false; } }; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 08d0f493ab30d92a121d089d9003bc575429b4dd..83867e0a2c198f265cb36be3a71546796dfbec67 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -45,7 +45,6 @@ endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) - cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e endif() if (WITH_ANAKIN) # only needed in CI - # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, - # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to - # compile the libinference_anakin_api.a and compile with anakin.so. - fetch_include_recursively(${ANAKIN_INCLUDE}) # compile the libinference_anakin_api.a and anakin.so. - nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) - nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) - target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - target_link_libraries(inference_anakin_api anakin anakin_saber_common) - target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) + nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin) + function(anakin_target target_name) + target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endfunction() + anakin_target(inference_anakin_api) + #anakin_target(inference_anakin_api_shared) if (WITH_TESTING) - # this test is unstable, disable it first. - #cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc - #ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin - #DEPS inference_anakin_api_shared) - #target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - endif(WITH_TESTING) + cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc + ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin + DEPS inference_anakin_api dynload_cuda SERIAL) + target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endif(WITH_TESTING) endif() diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 08d7af6d3af7054061b15b904c69b2862c629562..e31c637e969f7a86f4f185abb4f0f01d3303db75 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -22,6 +22,9 @@ limitations under the License. */ #include #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool(profile, false, "Turn on profiler for fluid"); namespace paddle { namespace { @@ -58,6 +61,15 @@ bool NativePaddlePredictor::Init( std::shared_ptr parent_scope) { VLOG(3) << "Predictor::init()"; + if (FLAGS_profile) { + LOG(WARNING) << "Profiler is actived, might affect the performance"; + LOG(INFO) << "You can turn off by set gflags '-profile false'"; + + auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; + platform::EnableProfiler(tracking_device); + } + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -102,6 +114,10 @@ bool NativePaddlePredictor::Init( } NativePaddlePredictor::~NativePaddlePredictor() { + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } if (sub_scope_) { scope_->DeleteScope(sub_scope_); } diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b24414e8245b1a4d90acce4fa1ad5690e06b47dd..794534467be066e91db2b4c204913ab2cf12dbfd 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -45,7 +45,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 8f42a37cd3f8978b917b42e8f45a128b8422aa57..6863b035d8cd9dfb21aed3947226a796778912a4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -activation_op.cc +activation_op.cc softmax_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) + +nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0064f90fd7944403c14d4d47616ea82f681ceb74 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights. + */ +class SoftMaxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) + << "convert a fluid softmax op to tensorrt softmax layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax, + *const_cast(input1)); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); +REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..503ce71f7fb4377bb4304569b7484fb25abdb284 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(SoftMaxOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(8, parameters, scope, 1000); + + std::vector tensor_shape{8, 10}; + validator.DeclInputVar("softmax-X", tensor_shape, + nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("softmax"); + desc.SetInput("X", {"softmax-X"}); + desc.SetOutput("Out", {"softmax-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 63c2f978f253df11100ecca83acae5eab6a0337d..4265f33f28fe36b1745baf4761c3c85e3a281d6b 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -79,6 +79,12 @@ class TRTConvertValidation { } // Declare a Variable as input with random initialization. + void DeclInputVar(const std::string& name, const std::vector tensor_dims, + const nvinfer1::Dims& trt_dims) { + DeclVar(name, tensor_dims); + engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims); + } + void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims); // Declare TRT inputs. @@ -94,12 +100,18 @@ class TRTConvertValidation { DeclVar(name, dims); } - // Declare a variable in a fluid Scope. - void DeclVar(const std::string& name, const nvinfer1::Dims& dims, - bool is_param = false) { + void DeclVar(const std::string& name, const std::vector dim_vec) { platform::CPUPlace place; platform::CPUDeviceContext ctx(place); + auto* x = scope_.Var(name); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + RandomizeTensor(x_tensor, place, ctx); + } + // Declare a variable in a fluid Scope. + void DeclVar(const std::string& name, const nvinfer1::Dims& dims, + bool is_param = false) { // Init Fluid tensor. std::vector dim_vec(dims.d, dims.d + dims.nbDims); // There is no batchsize in ITensor's shape, but We should add it to @@ -107,10 +119,8 @@ class TRTConvertValidation { // if_add_batch_ flag is true, add the max batchsize to dim_vec. if (is_param != true && if_add_batch_ == true) dim_vec.insert(dim_vec.begin(), max_batch_size_); - auto* x = scope_.Var(name); - auto* x_tensor = x->GetMutable(); - x_tensor->Resize(framework::make_ddim(dim_vec)); - RandomizeTensor(x_tensor, place, ctx); + + DeclVar(name, dim_vec); } void SetOp(const framework::proto::OpDesc& desc) { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4c3b8ec78190723598a56f7633764f10dd5047f3..ff0e989464e76b0f7cb163bd95997b82303036d6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -235,7 +235,12 @@ else() endif() op_library(cross_entropy_op DEPS cross_entropy) -op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +if(WITH_GPU) + op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub) +else() + op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +endif() + op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) @@ -273,9 +278,9 @@ op_library(squeeze_op DEPS reshape_op) op_library(extract_rows_op DEPS memory) op_library(flatten_op DEPS reshape_op) - if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) + op_library(layer_norm_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 5b5a220cf90e7813f914ae35733e7a4103391b2d..a2a871efa850df5101be7c27ebd81456acace7e1 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -188,6 +188,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); -REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop, ops::CropKernel); REGISTER_OP_CPU_KERNEL( crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu index 1a391860463dba14ad0de755ceb659bc9f64adc9..b75678217e36aa2297c68a7f8e2a9dfafadaca72 100644 --- a/paddle/fluid/operators/crop_op.cu +++ b/paddle/fluid/operators/crop_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop, ops::CropKernel); REGISTER_OP_CUDA_KERNEL( crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 772e80bbea4f2db654cefd0dcb404bc33803bd7a..2d7d33bd4f9b42b644444912570375bad92ba6c2 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -58,32 +58,74 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { return res; } -template +template +void CropFunction(const framework::ExecutionContext& context) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto out_dims = out->dims(); + if (out_dims[0] == -1) { + out_dims[0] = x->dims()[0]; + } + out->mutable_data(out_dims, context.GetPlace()); + auto x_stride = framework::stride(x->dims()); + auto out_stride = framework::stride(out->dims()); + auto offsets = GetOffsets(context); + int64_t offset = 0; + for (size_t i = 0; i < offsets.size(); ++i) { + offset += (x_stride[i] * offsets[i]); + } + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + Eigen::array e_offsets; + Eigen::array e_shape; + for (size_t i = 0; i < D; ++i) { + e_offsets[i] = offsets[i]; + e_shape[i] = out->dims()[i]; + } + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); +} + +template class CropKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - const T* x_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); - auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); - auto offsets = GetOffsets(context); - int64_t offset = 0; - for (size_t i = 0; i < offsets.size(); ++i) { - offset += (x_stride[i] * offsets[i]); + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + CropFunction(context); + break; + case 2: + CropFunction(context); + break; + case 3: + CropFunction(context); + break; + case 4: + CropFunction(context); + break; + case 5: + CropFunction(context); + break; + case 6: + CropFunction(context); + break; + default: + PADDLE_THROW( + "CropOp only support tensors with no more than 6 dimensions."); } - StridedMemcpy(context.device_context(), x_data + offset, x_stride, - out->dims(), out_stride, out_data); } }; template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); if (d_x != nullptr) { auto* d_out = context.Input(framework::GradVarName("Out")); - d_x->mutable_data(context.GetPlace()); + d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a3bec3da45136bca5cb2763e7ffd6b67703a1813..578ab63bc380ee62d76e34b7cf3cbd590bfa2eda 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -28,23 +28,26 @@ class CrossEntropyOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); - PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, - "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], - "If Attr(soft_label) == true, the 2nd dimension of " + PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], + "If Attr(soft_label) == true, the last dimension of " "Input(X) and Input(Label) should be equal."); } else { - PADDLE_ENFORCE_EQ(label_dims[1], 1UL, - "If Attr(softLabel) == false, the 2nd dimension of " + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL, + "If Attr(softLabel) == false, the last dimension of " "Input(Label) should be 1."); } - ctx->SetOutputDim("Y", {x_dims[0], 1}); + auto y_dims = x_dims; + y_dims[rank - 1] = 1; + ctx->SetOutputDim("Y", y_dims); ctx->ShareLoD("X", /*->*/ "Y"); } @@ -74,24 +77,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto label_dims = ctx->GetInputDim("Label"); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); - PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); - PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); - PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], - "The 1st dimension of Input(X) and Input(Y@Grad) should " - "be equal."); - PADDLE_ENFORCE_EQ(dy_dims[1], 1, - "The 2nd dimension of Input(Y@Grad) should be 1."); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(dy_dims.size(), rank, + "Input(Y@Grad) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], - "When Attr(soft_label) == true, the 2nd dimension of " + PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], + "When Attr(soft_label) == true, the last dimension of " "Input(X) and Input(Label) should be equal."); } else { - PADDLE_ENFORCE_EQ(label_dims[1], 1, - "When Attr(soft_label) == false, the 2nd dimension of " + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + "When Attr(soft_label) == false, the last dimension of " "Input(Label) should be 1."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); @@ -113,18 +120,20 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape [N x D]," - " where N is the batch size and D is the number of classes. " - "This input is a probability computed by the previous operator, " - "which is almost always the result of a softmax operator."); - AddInput("Label", - "(Tensor), the ground truth which is a 2-D tensor. When " - "soft_label is set to false, Label is a Tensor with shape " - "[N x 1]. When soft_label is set to true, Label is a " - "Tensor with shape [N x D]."); + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "probability computed by the previous operator, which is almost " + "always the result of a softmax operator."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. When soft_label is set " + "to false, the last dimension size is 1; when soft_label is set to " + "true, the last dimension size is equal to the number of classes."); AddOutput("Y", - "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss."); + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the cross entropy loss."); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") @@ -132,6 +141,12 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( CrossEntropy Operator. +The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. +The matrix's second dimension(row length) is as same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take palce on each raw +of flattened matrixs. + It supports both standard cross-entropy and soft-label cross-entropy loss computation. 1) One-hot cross-entropy: diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 19a2aec92b267ece94685ce34604b7d1cfa5d209..36b58d80144d242277f6fc970a3a61a6721d4b50 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -33,8 +33,13 @@ class CrossEntropyOpKernel : public framework::OpKernel { auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); + int rank = x->dims().size(); + Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1); + Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + math::CrossEntropyFunctor()( - ctx.template device_context(), y, x, labels, + ctx.template device_context(), &y_2d, &x_2d, &labels_2d, ctx.Attr("soft_label")); } }; @@ -98,9 +103,12 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto* dy = ctx.Input(framework::GradVarName("Y")); auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dx_data = dx->mutable_data(ctx.GetPlace()); + T* dx_data = dx->mutable_data(ctx.GetPlace()); - int64_t class_num = x->dims()[1]; + // Following computation only depends on the last dimension size. So it's + // unnecessary to convert tensors to 2-D views. + int rank = x->dims().size(); + int64_t class_num = x->dims()[rank - 1]; if (ctx.Attr("soft_label")) { XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), label->data(), diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index d4a09bae3a98e4518f9885c1e9182f7033a0d262..54a4b87ec8f13c4d474aad4cc0b8159cd5f59d1c 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -227,6 +227,9 @@ class MineHardExamplesOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( neg_pos_ratio, 0.0f, "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_LT( + neg_dist_threshold, 1.0f, + "neg_dist_threshold must less than one in max_negative mode"); PADDLE_ENFORCE_GT( neg_dist_threshold, 0.0f, "neg_dist_threshold must greater than zero in max_negative mode"); diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu index 6cbf6066c92b02eb75922587f9da5192bed15580..dfff518f170b56d180b6883c363effb8dbd677b6 100644 --- a/paddle/fluid/operators/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -16,60 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/platform/float16.h" -namespace paddle { -namespace operators { - -template -__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n, - int post, int size) { - int idx_x = threadIdx.x + blockIdx.x * blockDim.x; - if (idx_x < size) { - int idx_y = idx_x / post - (idx_x / (n * post)) * n; - z[idx_x] = x[idx_x] + y[idx_y]; - } -} - -template -class ElementwiseAddKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - - const auto x = ctx.Input("X"); - const auto y = ctx.Input("Y"); - auto z = ctx.Output("Out"); - auto *z_data = z->mutable_data(ctx.GetPlace()); - - auto &device = *(ctx.cuda_device_context().eigen_device()); - const framework::DDim &x_dim = x->dims(); - framework::DDim y_dim = y->dims(); - int size = x->numel(); - if (x_dim == y_dim) { - auto dim = framework::make_ddim({size}); - auto z_eigen = framework::EigenTensor::From(*z, dim); - auto x_eigen = framework::EigenTensor::From(*x, dim); - auto y_eigen = framework::EigenTensor::From(*y, dim); - z_eigen.device(device) = x_eigen + y_eigen; - } else { - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - y_dim = trim_trailing_singular_dims(y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - int threads = 512; - int grids = (size + threads - 1) / threads; - auto stream = ctx.cuda_device_context().stream(); - ElementwiseAddCUDAKernel<<>>( - x->data(), y->data(), z_data, n, post, size); - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index 0b19723720171a857c946880c246e2247a0023a7..5356105e2e551c0528694091608fc7585dce66d2 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; - if (dx != nullptr) { - // In fact, we can just share memory, but it may cause a bug of memory - // optimizer - // dx->ShareDataWith(*dout); - framework::TensorCopy(*dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } - - if (dy == nullptr) return; - - const framework::DDim& x_dim = dout->dims(); - framework::DDim y_dim = dy->dims(); - if (x_dim == y_dim) { - // dy->ShareDataWith(*dout); - framework::TensorCopy(*dout, ctx.GetPlace(), - ctx.template device_context(), dy); + if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr && + dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_add_grad(ctx, x, y, out, dout, dx, dy); } else { - dy->mutable_data(ctx.GetPlace()); - // Perform reduction to dout to calculate dy - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - y_dim = trim_trailing_singular_dims(y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - - auto& device = - *(ctx.template device_context().eigen_device()); - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - auto eigen_dout = framework::EigenTensor::From( - *dout, framework::make_ddim({pre, n, post})); - auto eigen_dy = - framework::EigenTensor::From(*dy, framework::make_ddim({n})); - eigen_dy.device(device) = eigen_dout.sum( - framework::EigenDim<2>::From(framework::make_ddim({0, 2}))); + default_elementwise_add_grad(ctx, x, y, out, dout, dx, + dy); } } }; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 6840e1e08f3d5bc84a05f15e30982c7cfb59680b..0886c41a1b582881faf24f5531d414db4e4db71c 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/layer_norm_op.h" +namespace paddle { +namespace operators { + +inline static int GetDesiredBlockDim(int block_dim) { + const int kMaxBlockDim = 512; + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << (static_cast(std::log2f(block_dim)))); +} + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__) + +static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); } +static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); } + +template +struct PairForLayerNorm { + __device__ __forceinline__ PairForLayerNorm() {} + __device__ __forceinline__ PairForLayerNorm(const T &first, const T &second) + : first_(first), second_(second) {} + + T first_; + T second_; +}; + +template +struct PairForLayerNormAddFunctor { + __device__ __forceinline__ PairForLayerNorm operator()( + const PairForLayerNorm &p1, const PairForLayerNorm &p2) { + return PairForLayerNorm(p1.first_ + p2.first_, p1.second_ + p2.second_); + } +}; + +template +__global__ void LayerNormForward(const T *x, const T *scale, const T *bias, + T *y, T *mean, T *var, float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + // Step 1: Reduce to calculate mean and var + T mean_val = static_cast(0); + T var_val = static_cast(0); + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T tmp = x[i]; + mean_val += tmp; + var_val += (tmp * tmp); + } + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(mean_val, var_val), + PairForLayerNormAddFunctor()); + if (threadIdx.x == 0) { + auto tmp = pair.first_ / feature_size; + mean[blockIdx.x] = tmp; + var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp; + } + __syncthreads(); + mean_val = mean[blockIdx.x]; + var_val = static_cast(real_sqrt(var[blockIdx.x] + epsilon)); + + // Step 2: Calculate y + if (scale != nullptr) { + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j]; + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = scale[j] * (x[i] - mean_val) / var_val; + } + } + } else { // scale == nullptr + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = (x[i] - mean_val) / var_val + bias[j]; + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = (x[i] - mean_val) / var_val; + } + } + } +} + +// Make sure that d_scale != nullptr && d_bias != nullptr +// Since d_scale != nullptr, scale would not be nullptr +template +__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, + T *d_scale, T *d_bias, T *d_x, + const T *mean, const T *var, + const T *scale, float epsilon, + int batch_size, int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int beg_idx = threadIdx.x * feature_size + blockIdx.x; + int end_idx = batch_size * feature_size + blockIdx.x; + int stride = BlockDim * feature_size; + + T d_scale_partial = 0, d_bias_partial = 0; + + for (int i = beg_idx; i < end_idx; i += stride) { + int row_idx = i / feature_size; + auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); + d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; + d_bias_partial += d_y[i]; + if (HasDx) { + d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; + } + } + + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_scale_partial, d_bias_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_scale[blockIdx.x] = pair.first_; + d_bias[blockIdx.x] = pair.second_; + } +} + +// Make sure that there is only one true expression: d_scale != nullptr +// or d_bias != nullptr +// Notice: scale may be nullptr +template +__global__ void LayerNormBackwardGradientScaleOrBias( + const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean, + const T *var, const T *scale, float epsilon, int batch_size, + int feature_size) { + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int beg_idx = threadIdx.x * feature_size + blockIdx.x; + int end_idx = batch_size * feature_size + blockIdx.x; + int stride = BlockDim * feature_size; + T d_scale_or_d_bias_partial = 0; + + for (int i = beg_idx; i < end_idx; i += stride) { + int row_idx = i / feature_size; + auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); + if (HasDScale) { + d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; + } else { // d_bias != nullptr + d_scale_or_d_bias_partial += d_y[i]; + } + + if (HasDx) { + if (scale != nullptr) { + d_x[i] = d_y[i] * scale[blockIdx.x] / var_val; + } else { + d_x[i] = d_y[i] / var_val; + } + } + } + + d_scale_or_d_bias_partial = + BlockReduce(temp_storage).Reduce(d_scale_or_d_bias_partial, cub::Sum()); + + if (threadIdx.x == 0) { + if (HasDScale) { + d_scale[blockIdx.x] = d_scale_or_d_bias_partial; + } else { + d_bias[blockIdx.x] = d_scale_or_d_bias_partial; + } + } +} + +template +__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x, + const T *mean, + const T *var, + float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T d_x_reduce_tmp[2]; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + T block_mean = mean[blockIdx.x]; + T block_var = var[blockIdx.x]; + T d_x_mean_partial = 0, d_x_var_partial = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x_mean_partial += d_x[i]; + d_x_var_partial += d_x[i] * (x[i] - block_mean); + } + + auto pair = + BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_x_reduce_tmp[0] = pair.first_ / feature_size; + d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + } + __syncthreads(); + + d_x_mean_partial = d_x_reduce_tmp[0]; + d_x_var_partial = d_x_reduce_tmp[1]; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x[i] -= d_x_mean_partial; + d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + } +} + +// Here, we only calculate d_x +template +__global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y, + T *d_x, const T *mean, + const T *var, const T *scale, + float epsilon, + int feature_size) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T d_x_reduce_tmp[2]; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x]; + T d_x_mean_partial = 0, d_x_var_partial = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + auto var_val = static_cast(real_sqrt(block_var + epsilon)); + if (scale != nullptr) { + int col_idx = i % feature_size; + d_x[i] = d_y[i] * scale[col_idx] / var_val; + } else { + d_x[i] = d_y[i] / var_val; + } + d_x_mean_partial += d_x[i]; + d_x_var_partial += d_x[i] * (x[i] - block_mean); + } + + auto pair = + BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + d_x_reduce_tmp[0] = pair.first_ / feature_size; + d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + } + __syncthreads(); + + d_x_mean_partial = d_x_reduce_tmp[0]; + d_x_var_partial = d_x_reduce_tmp[1]; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + d_x[i] -= d_x_mean_partial; + d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + } +} + +template +__global__ void LayerNormBackwardWhenBatchSizeIsOne( + const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean, + const T *var, const T *scale, float epsilon, int feature_size) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < feature_size) { + auto var_val = static_cast(real_sqrt(var[idx] + epsilon)); + if (d_x != nullptr) { + if (d_scale == nullptr) { + d_x[idx] = d_y[idx] / var_val; + } else { + d_x[idx] = d_y[idx] * scale[idx] / var_val; + } + } + + if (d_scale != nullptr) { + d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val; + } + + if (d_bias != nullptr) d_bias[idx] = d_y[idx]; + } +} + +template +static void LayerNormBackward(const T *x, const T *d_y, const T *scale, + const T *mean, const T *var, T *d_x, T *d_scale, + T *d_bias, float epsilon, int batch_size, + int feature_size, cudaStream_t stream) { + const int kMaxBlockDim = 512; + int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) | + ((d_scale != nullptr ? 1 : 0) << 1) | + ((d_bias != nullptr ? 1 : 0)); + if (gradient_flag == 0) return; + + if (batch_size == 1) { + LayerNormBackwardWhenBatchSizeIsOne< + T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, + stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, + feature_size); + + if (d_x != nullptr) { + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<<1, kBlockDim, 0, stream>>>( + x, d_x, mean, var, epsilon, feature_size)); + } + } + return; + } + + auto block_dim = GetDesiredBlockDim(batch_size); + switch (gradient_flag) { + case 1: // d_x == nulptr, d_scale == nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, false, + false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + break; + case 2: // d_x == nullptr, d_scale != nullptr, d_bias == nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, false, + true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + break; + case 3: // d_x == nullptr, d_scale != nulptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientAll< + T, kBlockDim, false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, + batch_size, feature_size)); + } + break; + case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientOnlyDX< + T, kBlockDim><<>>( + x, d_y, d_x, mean, var, scale, epsilon, feature_size)); + } + break; + case 5: // d_x != nulptr, d_scale == nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, true, + false><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias< + T, kBlockDim, true, + true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, + feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr + switch (block_dim) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardGradientAll< + T, kBlockDim, true><<>>( + x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, + batch_size, feature_size)); + } + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX< + T, kBlockDim><<>>( + x, d_x, mean, var, epsilon, feature_size)); + } + break; + default: + break; + } +} + +template +class LayerNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto *x = ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x->dims(); + auto *x_data = x->data(); + auto *y_data = y->mutable_data(ctx.GetPlace()); + auto *mean_data = mean->mutable_data(ctx.GetPlace()); + auto *var_data = var->mutable_data(ctx.GetPlace()); + auto *scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *bias_data = (bias == nullptr ? nullptr : bias->data()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + + auto stream = ctx.cuda_device_context().stream(); + + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForward<<>>( + x_data, scale_data, bias_data, y_data, mean_data, var_data, + epsilon, feature_size)); + default: + PADDLE_THROW( + "Product from begin_norm_axis to end must be larger than 1"); + break; + } + } +}; + +template +class LayerNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + // d_x, d_scale, d_bias may be nullptr + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + auto *x = ctx.Input("X"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *d_y = ctx.Input(framework::GradVarName("Y")); + + auto *x_data = x->data(); + auto *d_y_data = d_y->data(); + auto *mean_data = mean->data(); + auto *var_data = var->data(); + auto *scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *d_scale_data = + (d_scale == nullptr ? nullptr + : d_scale->mutable_data(ctx.GetPlace())); + auto *d_bias_data = + (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + auto *d_x_data = + (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); + + const auto &x_dims = x->dims(); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + + auto stream = ctx.cuda_device_context().stream(); + + LayerNormBackward(x_data, d_y_data, scale_data, mean_data, var_data, + d_x_data, d_scale_data, d_bias_data, epsilon, + batch_size, feature_size, stream); + } +}; + +#undef FIXED_BLOCK_DIM_CASE_BASE +#undef FIXED_BLOCK_DIM_CASE +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 1205bd0587f32caae04c27ecea581fc17988507f..cf1eeb017d666f605a431aa54637d8cbc99c7c46 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -31,16 +31,12 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); - auto dims = X->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::LoDTensor flattened_x; - framework::LoDTensor flattened_out; - flattened_x.ShareDataWith(*X).Resize(flattened_dims); - flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + int rank = X->dims().size(); + Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); + Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); math::SoftmaxFunctor()( - context.template device_context(), &flattened_x, - &flattened_out); + context.template device_context(), &X_2d, &Out_2d); } }; @@ -55,18 +51,14 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. dX->mutable_data(context.GetPlace()); - auto dims = Out->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::LoDTensor flattened_out; - framework::LoDTensor flattened_d_out; - framework::LoDTensor flattened_d_x; - flattened_out.ShareDataWith(*Out).Resize(flattened_dims); - flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); - flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); + int rank = Out->dims().size(); + Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); + Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); math::SoftmaxGradFunctor()( - context.template device_context(), &flattened_out, - &flattened_d_out, &flattened_d_x); + context.template device_context(), &Out_2d, &dOut_2d, + &dX_2d); } }; diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index d0286719b9ea1aa671294f519051ac1e269c4e93..652a6ec7a4e2e823b28f39b449570cd375e88e18 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -270,12 +270,13 @@ struct EventItem { double min_time; double max_time; double ave_time; + float ratio; }; // Print results void PrintProfiler(const std::vector>& events_table, const std::string& sorted_domain, const size_t name_width, - const size_t data_width) { + const size_t data_width, double total) { // Output header information std::cout << "\n------------------------->" << " Profiling Report " @@ -300,7 +301,8 @@ void PrintProfiler(const std::vector>& events_table, std::cout << std::setw(name_width) << "Event" << std::setw(data_width) << "Calls" << std::setw(data_width) << "Total" << std::setw(data_width) << "Min." << std::setw(data_width) - << "Max." << std::setw(data_width) << "Ave." << std::endl; + << "Max." << std::setw(data_width) << "Ave." + << std::setw(data_width) << "Ratio." << std::endl; for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { const EventItem& event_item = events_table[i][j]; @@ -309,7 +311,9 @@ void PrintProfiler(const std::vector>& events_table, << std::setw(data_width) << event_item.total_time << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time - << std::setw(data_width) << event_item.ave_time << std::endl; + << std::setw(data_width) << event_item.ave_time + << std::setw(data_width) << event_item.total_time / total + << std::endl; } } std::cout << std::endl; @@ -359,6 +363,7 @@ void ParseEvents(const std::vector>& events, std::vector> events_table; size_t max_name_width = 0; + double total = 0.; // the total time for (size_t i = 0; i < events.size(); i++) { std::list pushed_events; std::vector event_items; @@ -379,6 +384,7 @@ void ParseEvents(const std::vector>& events, g_state == ProfilerState::kAll) ? rit->CudaElapsedMs(events[i][j]) : rit->CpuElapsedMs(events[i][j]); + total += event_time; std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); @@ -387,7 +393,8 @@ void ParseEvents(const std::vector>& events, if (event_idx.find(event_name) == event_idx.end()) { event_idx[event_name] = event_items.size(); EventItem event_item = {event_name, 1, event_time, - event_time, event_time, event_time}; + event_time, event_time, event_time, + 0.}; event_items.push_back(event_item); } else { int index = event_idx[event_name]; @@ -431,7 +438,7 @@ void ParseEvents(const std::vector>& events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total); } void DisableProfiler(EventSortingKey sorted_key, diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 2199f5311fd3728e624fc222a1b876eb947cc0aa..be623703c2480774bb04a6bc0c5b00b699d7bb16 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) { std::string ser(seriralized); self.SetAttr(name, ser); }) - .def("block_attr", &pd::OpDesc::GetBlockAttr) + .def("block_attr_id", &pd::OpDesc::GetBlockAttrId) + .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("check_attrs", &pd::OpDesc::CheckAttrs) .def("infer_shape", &pd::OpDesc::InferShape) .def("infer_var_type", &pd::OpDesc::InferVarType) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2320f3e4dbc5d98698670fee19ed983411a802a9..7127bb38f6ddf8a55c1741d1f0ef18c8d9067fba 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -664,7 +664,7 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("bcast_params", &ParallelExecutor::BCastParamsToDevices) + .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index c9a257ba3f86aac79bfe891d78fdaf36d2df8a50..7504474591fa486428d0310f10387818c4cf0300 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -36,8 +36,7 @@ URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' # this is a small set of data for test. The original data is too large and # will be add later. -URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' - 'wmt_shrinked_data/wmt14.tgz') +URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index cccb4abe6cf14ac3e90ba59d970c9398b09b7db2..1ae05dec8de6b9cbc9568f0f4d437833be520f8d 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -48,8 +48,6 @@ from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from .transpiler import DistributeTranspiler, InferenceTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig -from .concurrency import (Go, make_channel, channel_send, channel_recv, - channel_close, Select) from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from . import clip from . import profiler @@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor -__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ +__all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ parallel_executor.__all__ + lod_tensor.__all__ + [ 'io', diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 6b739745119836a80518d7bc6138b37101e74431..fd6a76dd0cfa347328d87093884e5cd324395497 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -344,7 +344,7 @@ def _append_backward_ops_(block, grad_sub_block_list = [] # If the op has its own sub-block, deal with the sub-block first if op.has_attr("sub_block"): - sub_block = program.block(op.block_attr("sub_block")) + sub_block = program.block(op.block_attr_id("sub_block")) grad_sub_block = program.create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) cb = _callback_lookup_(op) @@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) if op_desc.has_attr("sub_block"): - sub_block = block.program.block(op_desc.block_attr("sub_block")) + sub_block = block.program.block(op_desc.block_attr_id("sub_block")) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) new_vars = set() # create new gradient variables diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index a8c4d66720d6eda857e5960d86fc3b8ec8f11ade..676a52a917dd1f9700ec38de32932938ec339be5 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -19,8 +19,7 @@ from .layers import fill_constant from . import core __all__ = [ - 'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', - 'Select' + 'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select' ] @@ -35,10 +34,10 @@ class Go(BlockGuard): def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: return False - self.construct_go_op() + self._construct_go_op() return super(Go, self).__exit__(exc_type, exc_val, exc_tb) - def construct_go_op(self): + def _construct_go_op(self): main_program = self.helper.main_program go_block = main_program.current_block() parent_block = main_program.block(main_program.current_block() diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 35da1d06a2c1da8ba663ea0f0b9a0e58ea7c4470..e24b9faae24084ccc743a5b5126db9667089e128 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -18,9 +18,7 @@ import six from .framework import Program, default_main_program, Variable from . import core -__all__ = [ - 'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var' -] +__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope'] g_scope = core.Scope() @@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): return fetch_count > 0 -def fetch_var(name, scope=None, return_numpy=True): +def _fetch_var(name, scope=None, return_numpy=True): """ Fetch the value of the variable with the given name from the given scope. @@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list): return str(feed_var_names + fetch_var_names) +def _as_lodtensor(data, place): + """ + Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + + Examples: + >>> import paddle.fluid as fluid + >>> place = fluid.CPUPlace() + >>> exe = fluid.executor(place) + >>> data = np.array(size=(100, 200, 300)) + >>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data) + >>> ... + + Args: + data(numpy.ndarray): a instance of array + + Returns: + LoDTensor + """ + if isinstance(data, list): + raise RuntimeError("Some of your feed data hold LoD information. \ + They can not be completely cast from a list of Python \ + ndarray to LoDTensor. Please convert data to LoDTensor \ + directly before feeding the data.\ + ") + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, place) + return tensor + + class Executor(object): """ An Executor in Python, only support the single-GPU running. For multi-cards, please refer to @@ -250,35 +279,6 @@ class Executor(object): self.program_caches = dict() self._closed = False - def as_lodtensor(self, data): - """ - Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. - For higher dimensional sequence data, please use LoDTensor directly. - - Examples: - >>> import paddle.fluid as fluid - >>> exe = fluid.executor(fluid.CPUPlace()) - >>> data = np.array(size=(100, 200, 300)) - >>> np_outs = map(lambda x: exe.as_lodtensor(x), data) - >>> ... - - Args: - data(numpy.ndarray): a instance of array - - Returns: - LoDTensor - """ - if isinstance(data, list): - raise RuntimeError("Some of your feed data hold LoD information. \ - They can not be completely cast from a list of Python \ - ndarray to LoDTensor. Please convert data to LoDTensor \ - directly before feeding the data.\ - ") - # single tensor case - tensor = core.LoDTensor() - tensor.set(data, self.place) - return tensor - def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) @@ -337,7 +337,7 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.as_lodtensor(cur_feed) + cur_feed = _as_lodtensor(cur_feed, self.place) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 18ee6b0c92a81d89ced9e81db440e9da507b7a17..2e6b45eb7ac03923da4d8485e736925691b0b580 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -476,23 +476,25 @@ class Operator(object): attrs=None): self.block = block self.desc = desc - self.attrs = attrs - if self.attrs is None: - self.attrs = dict() + # note: not add self.attrs here: + # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173 + op_attrs = attrs + if op_attrs is None: + op_attrs = dict() del attrs op_maker = core.op_proto_and_checker_maker - if op_maker.kOpRoleAttrName() not in self.attrs: - self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role + if op_maker.kOpRoleAttrName() not in op_attrs: + op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role role_var_name = op_maker.kOpRoleVarAttrName() if len(self.block.program. - op_role_var) != 0 and role_var_name not in self.attrs: - self.attrs[role_var_name] = self.block.program.op_role_var + op_role_var) != 0 and role_var_name not in op_attrs: + op_attrs[role_var_name] = self.block.program.op_role_var - if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0: - del self.attrs[role_var_name] + if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: + del op_attrs[role_var_name] if len(self.desc.type()) != 0: return @@ -576,15 +578,14 @@ class Operator(object): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - if self.attrs is not None: - if not isinstance(self.attrs, dict): + if op_attrs is not None: + if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") for attr in proto.attrs: attr_name = attr.name - if (attr_name not in self.attrs) or ( - self.attrs[attr_name] is None): + if (attr_name not in op_attrs) or (op_attrs[attr_name] is None): continue - attr_val = self.attrs[attr_name] + attr_val = op_attrs[attr_name] self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() @@ -732,7 +733,6 @@ class Operator(object): Raises: ValueError: If the type of value doesn't match with desc.attr_type(name). """ - self.attrs[name] = val self._update_desc_attr(name, val) def _update_desc_attr(self, name, val): @@ -774,9 +774,9 @@ class Operator(object): """ return self.desc.attr(name) - def block_attr(self, name): + def block_attr_id(self, name): """ - Get the block attribute by name. + Get the block attribute's id by name. Args: name(str): the attribute name. @@ -784,22 +784,74 @@ class Operator(object): Returns: int: the block index. """ - return self.desc.block_attr(name) + return self.desc.block_attr_id(name) + + def block_attr(self, name): + """ + Get the block attribute by name. + + Args: + name(str): the attribute name. + + Returns: + block: the block attribute. + """ + + id = self.block_attr_id(name) + assert (id >= 0 and id < len(self.block.program.blocks)) + return self.block.program.blocks[id] + + def blocks_attr(self, name): + """ + Get the blocks attribute by name. + + Args: + name(str): the attribute name. + + Returns: + list: list of the blocks attribute. + """ + attrs = [] + for i in self.blocks_attr_ids(name): + assert (i >= 0 and i < len(self.block.program.blocks)) + attrs.append(self.block.program.blocks[i]) + + return attrs + + def blocks_attr_ids(self, name): + """ + Get the blocks attribute's ids by name. + + Args: + name(str): the attribute name. + + Returns: + list: list of the blocks ids. + """ + + return self.desc.blocks_attr_ids(name) def all_attrs(self): """ Get the attribute dict. Returns: - dict: The Operator's attribute dict. + dict: The Operator's attribute dict, name->attr. """ attr_names = self.attr_names attr_map = {} for n in attr_names: - if n == 'sub_block': + attr_type = self.desc.attr_type(n) + if attr_type == core.AttrType.BLOCK: attr_map[n] = self.block_attr(n) - else: - attr_map[n] = self.attr(n) + continue + + if attr_type == core.AttrType.BLOCKS: + attr_map[n] = self.blocks_attr(n) + continue + + attr_map[n] = self.attr(n) + return attr_map @@ -1525,11 +1577,17 @@ class Program(object): The two code snippets above will generate same programs. """ if for_test: - p = self.inference_optimize() + p = self.inference_optimize(export_for_deployment=False) else: p = Program() + p.current_block_idx = self.current_block_idx + p._seed = self._seed p.desc = core.ProgramDesc(self.desc) - p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())] + p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] + + p._current_role = self._current_role + p._op_role_var = self._op_role_var + p._sync_with_cpp() p._copy_param_info_from(self) @@ -1585,7 +1643,7 @@ class Program(object): res._sync_with_cpp() return res - def inference_optimize(self): + def inference_optimize(self, export_for_deployment=True): """ This method will create a new program and do following adjustments on it: 1. Remove all reader variables and their creator ops if exist. @@ -1596,6 +1654,10 @@ class Program(object): attribute of operators to :code:`True`. All the :code:`Parameter` information will be lost. + Args: + export_for_deployment(bool): remove the read ops that are added by py_reader + for cpp inference library + Notes: This API is a very low level API. Use :code:`Program.clone(for_test=True)` instead. @@ -1610,16 +1672,17 @@ class Program(object): # remove all readers and the read_op if exist read_op_idx = 0 root_block = res.desc.block(0) - while True: - if read_op_idx >= root_block.op_size() or root_block.op( - read_op_idx).type() == 'read': - break - read_op_idx += 1 - if read_op_idx < root_block.op_size(): - root_block._remove_op(0, read_op_idx + 1) - for var in root_block.all_vars(): - if var.type() == core.VarDesc.VarType.READER: - root_block._remove_var(var.name()) + if export_for_deployment: + while True: + if read_op_idx >= root_block.op_size() or root_block.op( + read_op_idx).type() == 'read': + break + read_op_idx += 1 + if read_op_idx < root_block.op_size(): + root_block._remove_op(0, read_op_idx + 1) + for var in root_block.all_vars(): + if var.type() == core.VarDesc.VarType.READER: + root_block._remove_var(var.name()) # change all `is_test` attributes to True for i in range(res.desc.num_blocks()): diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 83290ac60839b855a0348696ad6898af7335e2fc..3f740dd7c539973c5bb2b260120729e9563eedd2 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -264,7 +264,8 @@ class NormalInitializer(Initializer): "dtype": int(var.dtype), "mean": self._mean, "std": self._std_dev, - "seed": self._seed + "seed": self._seed, + "use_mkldnn": False }) var.op = op return op diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8665925286250eab8188b216ddeef8764da85f2e..f4a1fa78b8956b2d555b87e64feb75fd692fa433 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -559,7 +559,8 @@ def save_inference_model(dirname, executor, main_program=None, model_filename=None, - params_filename=None): + params_filename=None, + export_for_deployment=True): """ Prune the given `main_program` to build a new program especially for inference, and then save it and all related parameters to given `dirname` by the `executor`. @@ -581,6 +582,8 @@ def save_inference_model(dirname, params_filename(str|None): The name of file to save all related parameters. If it is setted None, parameters will be saved in separate files . + export_for_deployment(bool): remove the read ops that are added by py_reader + for cpp inference lib. Default True Returns: None @@ -647,7 +650,8 @@ def save_inference_model(dirname, copy_program.desc.flush() pruned_program = copy_program.prune(targets=target_vars) - inference_program = pruned_program.inference_optimize() + inference_program = pruned_program.inference_optimize( + export_for_deployment=export_for_deployment) fetch_var_names = [v.name for v in target_vars] prepend_feed_ops(inference_program, feeded_var_names) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 9fae96d9bc7f5e6e89f8ee3894d28ae306e10ca9..0800c02d9ec7dc51d4f22d3b04f3ae90315ebad8 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -164,7 +164,7 @@ def rpn_target_assign(loc, }) # 4. Reshape and gather the target entry - scores = nn.reshape(x=scores, shape=(-1, 1)) + scores = nn.reshape(x=scores, shape=(-1, 2)) loc = nn.reshape(x=loc, shape=(-1, 4)) target_label = nn.reshape(x=target_label, shape=(-1, 1)) target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4)) @@ -722,7 +722,7 @@ def ssd_loss(location, }, attrs={ 'neg_pos_ratio': neg_pos_ratio, - 'neg_dist_threshold': neg_pos_ratio, + 'neg_dist_threshold': neg_overlap, 'mining_type': mining_type, 'sample_size': sample_size, }) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index eabe6bb901ad1b7afe71717eb3f5f5765f261201..2a3555ebdde4d54f63bb420218896560c1b40ffd 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -273,19 +273,19 @@ class ParallelExecutor(object): arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() if self.is_dist: - self.bcast_params() + self._bcast_params() if return_numpy: return executor.as_numpy(arr) return [arr[i] for i in range(len(arr))] - def bcast_params(self): + def _bcast_params(self): """ Broadcast the parameters to other devices. It is used during distributed training. """ - self.executor.bcast_params(set(self.persistable_vars)) + self.executor._bcast_params(set(self.persistable_vars)) @property def device_count(self): diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5ba33f7cbf5286edc4503c219fd3cdff60c517 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -0,0 +1,103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "float32" +paddle.dataset.mnist.fetch() + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) + + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale, seed=1))) + return predict + + +class TestDistMnist2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +if __name__ == "__main__": + runtime_main(TestDistMnist2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 62ca67ae4c041c0fe6c0836c79ebe6c23248e43e..d576a173ce2546119ede49128ef69d240c7cf482 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -27,6 +27,7 @@ from multiprocessing import Process import os import sys import signal +from test_dist_base import TestDistRunnerBase, runtime_main # Fix seed for test fluid.default_startup_program().random_seed = 1 @@ -196,161 +197,52 @@ class SE_ResNeXt(): return scale -def get_model(batch_size): - # Input data - image = fluid.layers.data(name="data", shape=[3, 224, 224], dtype='float32') - label = fluid.layers.data(name="int64", shape=[1], dtype='int64') +class DistSeResneXt2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Input data + image = fluid.layers.data( + name="data", shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name="int64", shape=[1], dtype='int64') - # Train program - model = SE_ResNeXt(layers=50) - out = model.net(input=image, class_dim=102) - cost = fluid.layers.cross_entropy(input=out, label=label) + # Train program + model = SE_ResNeXt(layers=50) + out = model.net(input=image, class_dim=102) + cost = fluid.layers.cross_entropy(input=out, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) - # Evaluator - test_program = fluid.default_main_program().clone(for_test=True) + # Evaluator + test_program = fluid.default_main_program().clone(for_test=True) - # Optimization - total_images = 6149 # flowers - epochs = [30, 60, 90] - step = int(total_images / batch_size + 1) + # Optimization + total_images = 6149 # flowers + epochs = [30, 60, 90] + step = int(total_images / batch_size + 1) - bd = [step * e for e in epochs] - base_lr = 0.1 - lr = [] - lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + bd = [step * e for e in epochs] + base_lr = 0.1 + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - optimizer = fluid.optimizer.Momentum( - # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed. - #learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), - learning_rate=base_lr, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - optimizer.minimize(avg_cost) + optimizer = fluid.optimizer.Momentum( + # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed. + #learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + learning_rate=base_lr, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(avg_cost) - # Reader - train_reader = paddle.batch( - paddle.dataset.flowers.train(), batch_size=batch_size) - test_reader = paddle.batch( - paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) + # Reader + train_reader = paddle.batch( + paddle.dataset.flowers.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) - return test_program, avg_cost, train_reader, test_reader, acc_top1, out - - -def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id=trainer_id, - program=main_program, - pservers=pserver_endpoints, - trainers=trainers) - return t - - -class DistSeResneXt2x2: - def run_pserver(self, pserver_endpoints, trainers, current_endpoint, - trainer_id): - get_model(batch_size=2) - t = get_transpiler(trainer_id, - fluid.default_main_program(), pserver_endpoints, - trainers) - pserver_prog = t.get_pserver_program(current_endpoint) - startup_prog = t.get_startup_program(current_endpoint, pserver_prog) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - exe.run(pserver_prog) - - def _wait_ps_ready(self, pid): - retry_times = 20 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(3) - print("waiting ps ready: ", pid) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): - test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( - batch_size=2) - if is_dist: - t = get_transpiler(trainer_id, - fluid.default_main_program(), endpoints, - trainers) - trainer_prog = t.get_trainer_program() - else: - trainer_prog = fluid.default_main_program() - - startup_exe = fluid.Executor(place) - startup_exe.run(fluid.default_startup_program()) - - strategy = fluid.ExecutionStrategy() - strategy.num_threads = 1 - strategy.allow_op_delay = False - exe = fluid.ParallelExecutor( - True, loss_name=avg_cost.name, exec_strategy=strategy) - - feed_var_list = [ - var for var in trainer_prog.global_block().vars.values() - if var.is_data - ] - - feeder = fluid.DataFeeder(feed_var_list, place) - reader_generator = test_reader() - - data = next(reader_generator) - first_loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(data)) - print(first_loss) - - for i in six.moves.xrange(5): - data = next(reader_generator) - loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) - - data = next(reader_generator) - last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) - print(last_loss) - - -def main(role="pserver", - endpoints="127.0.0.1:9123", - trainer_id=0, - current_endpoint="127.0.0.1:9123", - trainers=1, - is_dist=True): - model = DistSeResneXt2x2() - if role == "pserver": - model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) - else: - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + return test_program, avg_cost, train_reader, test_reader, acc_top1, out if __name__ == "__main__": - if len(sys.argv) != 7: - print( - "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" - ) - role = sys.argv[1] - endpoints = sys.argv[2] - trainer_id = int(sys.argv[3]) - current_endpoint = sys.argv[4] - trainers = int(sys.argv[5]) - is_dist = True if sys.argv[6] == "TRUE" else False - main( - role=role, - endpoints=endpoints, - trainer_id=trainer_id, - current_endpoint=current_endpoint, - trainers=trainers, - is_dist=is_dist) + runtime_main(DistSeResneXt2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..54a70f4adb4a9bb24e3c618a7fe71f42a376609b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -0,0 +1,119 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from test_dist_base import TestDistRunnerBase, runtime_main + +IS_SPARSE = True +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +class TestDistWord2vec2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + BATCH_SIZE = batch_size + + def __network__(words): + embed_first = fluid.layers.embedding( + input=words[0], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', initializer=fluid.initializer.Constant())) + embed_second = fluid.layers.embedding( + input=words[1], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', initializer=fluid.initializer.Constant())) + embed_third = fluid.layers.embedding( + input=words[2], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', initializer=fluid.initializer.Constant())) + embed_forth = fluid.layers.embedding( + input=words[3], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name='shared_w', initializer=fluid.initializer.Constant())) + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1) + hidden1 = fluid.layers.fc( + input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant())) + predict_word = fluid.layers.fc( + input=hidden1, + size=dict_size, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant())) + cost = fluid.layers.cross_entropy( + input=predict_word, label=words[4]) + avg_cost = fluid.layers.mean(cost) + return avg_cost, predict_word + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data( + name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + avg_cost, predict_word = __network__( + [first_word, second_word, third_word, forth_word, next_word]) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) + + return inference_program, avg_cost, train_reader, test_reader, None, predict_word + + +if __name__ == "__main__": + runtime_main(TestDistWord2vec2x2) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index c5b9e92d69133e593a2ce223e83006eda590daa5..86ac159323a5f9f6149ce5ed4437402eb885c6bc 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -105,5 +105,107 @@ class TestCrossEntropyOp3(OpTest): ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) +class TestCrossEntropyOp4(OpTest): + """Test high rank tensor cross-entropy with discrete one-hot labels. + """ + + def setUp(self): + self.op_type = "cross_entropy" + shape = [10, 2, 4] + ins_num = np.prod(np.array(shape)) + class_num = 10 + + X_2d = randomize_probability(ins_num, class_num, dtype='float64') + + label_2d = np.random.randint(0, class_num, (ins_num, 1), dtype="int64") + cross_entropy_2d = np.asmatrix( + [[-np.log(X_2d[i][label_2d[i][0]])] for i in range(X_2d.shape[0])], + dtype="float64") + + X = X_2d.reshape(shape + [class_num]) + label = label_2d.reshape(shape + [1]) + cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1]) + + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": False} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + +class TestCrossEntropyOp5(OpTest): + """Test high rank tensor cross-entropy with vectorized soft labels. + """ + + def setUp(self): + self.op_type = "cross_entropy" + shape = [4, 3] + ins_num = np.prod(np.array(shape)) + class_num = 37 + + X_2d = randomize_probability(ins_num, class_num) + label_2d = np.random.uniform(0.1, 1.0, + [ins_num, class_num]).astype("float32") + label_2d /= label_2d.sum(axis=1, keepdims=True) + cross_entropy_2d = (-label_2d * np.log(X_2d)).sum( + axis=1, keepdims=True).astype("float32") + + X = X_2d.reshape(shape + [class_num]) + label = label_2d.reshape(shape + [class_num]) + cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1]) + + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) + + +class TestCrossEntropyOp6(OpTest): + """Test high rank tensor cross-entropy with vectorized one-hot representation of labels. + """ + + def setUp(self): + self.op_type = "cross_entropy" + shape = [4, 3, 2] + ins_num = np.prod(np.array(shape)) + class_num = 17 + + X_2d = randomize_probability(ins_num, class_num) + label_index_2d = np.random.randint( + 0, class_num, (ins_num), dtype="int32") + label_2d = np.zeros(X_2d.shape) + label_2d[np.arange(ins_num), label_index_2d] = 1 + + cross_entropy_2d = np.asmatrix( + [[-np.log(X_2d[i][label_index_2d[i]])] + for i in range(X_2d.shape[0])], + dtype="float32") + + X = X_2d.reshape(shape + [class_num]) + label = label_2d.reshape(shape + [class_num]) + cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1]) + + self.inputs = {"X": X, "Label": label.astype(np.float32)} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": True} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py new file mode 100644 index 0000000000000000000000000000000000000000..8603d3a5b3b5d368fe87b8dcf9dc7363f95caf86 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -0,0 +1,196 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +import collections + +SEED = 1 +DTYPE = "float32" +paddle.dataset.mnist.fetch() + + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(batch_size): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def operator_equal(a, b): + for k, v in a.__dict__.iteritems(): + if isinstance(v, fluid.framework.Program) or \ + isinstance(v, fluid.framework.Block): + continue + + elif isinstance(v, core.OpDesc): + if v.serialize_to_string() != b.__dict__[k].serialize_to_string(): + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + elif isinstance(v, collections.OrderedDict): + v0 = sorted(v.iteritems(), key=lambda x: x[0]) + v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + + if v0 != v1: + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + elif (v != b.__dict__[k]): + raise ValueError("In operator_equal not equal:{0}\n".format(k)) + + return True + + +def block_equal(a, b): + for k, v in a.__dict__.iteritems(): + if isinstance(v, core.ProgramDesc) or isinstance( + v, fluid.framework.Program) or isinstance(v, core.BlockDesc): + continue + + elif k == "ops": + for i in range(0, len(a.ops)): + if not operator_equal(a.ops[i], b.ops[i]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) + assert (len(a.ops) == len(b.ops)) + + elif isinstance(v, collections.OrderedDict): + v0 = sorted(v.iteritems(), key=lambda x: x[0]) + v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + + if v0 != v1: + raise ValueError("In block_equal not equal:{0}\n".format(k)) + + elif (v != b.__dict__[k]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) + + return True + + +def program_equal(a, b): + for k, v in a.__dict__.iteritems(): + if isinstance(v, core.ProgramDesc): + continue + + elif k == 'blocks': + for i in range(0, len(a.blocks)): + if not block_equal(a.blocks[i], b.blocks[i]): + raise ValueError("In operator_equal not equal:{0}\n".format( + k)) + return False + assert (len(a.blocks) == len(b.blocks)) + + elif (v != b.__dict__[k]): + raise ValueError("In program_equal not equal:{0}\n".format(k)) + + return True + + +class TestDistMnist(unittest.TestCase): + def test_desc_clone(self): + get_model(batch_size=20) + + pserver_endpoints = "127.0.0.1:9123" + trainers = 1 + current_endpoint = "127.0.0.1:9123" + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + main = pserver_prog.clone() + startup = startup_prog.clone() + + self.assertTrue(program_equal(main, pserver_prog)) + self.assertTrue(program_equal(startup, startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 58cfd4e1fd958d8d59e49c87fbbabd0182975add..4379463aca4443eb7a886ce78446440cc59f3b30 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -18,6 +18,109 @@ import os import sys import signal import subprocess +import six + + +class TestDistRunnerBase(object): + def get_model(self, batch_size=2): + raise NotImplementedError( + "get_model should be implemented by child classes.") + + def get_transpiler(self, trainer_id, main_program, pserver_endpoints, + trainers): + # NOTE: import fluid until runtime, or else forking processes will cause error. + import paddle + import paddle.fluid as fluid + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + def run_pserver(self, pserver_endpoints, trainers, current_endpoint, + trainer_id): + import paddle + import paddle.fluid as fluid + self.get_model(batch_size=2) + t = self.get_transpiler(trainer_id, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + import paddle + import paddle.fluid as fluid + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ + self.get_model(batch_size=2) + if is_dist: + t = self.get_transpiler(trainer_id, + fluid.default_main_program(), endpoints, + trainers) + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, loss_name=avg_cost.name, exec_strategy=strategy) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.values() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + reader_generator = test_reader() + + data = next(reader_generator) + first_loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) + print(first_loss) + + for i in six.moves.xrange(5): + data = next(reader_generator) + loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + + data = next(reader_generator) + last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + print(last_loss) + + +def runtime_main(test_class): + import paddle + import paddle.fluid as fluid + import paddle.fluid.core as core + + if len(sys.argv) != 7: + print( + "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + ) + role = sys.argv[1] + endpoints = sys.argv[2] + trainer_id = int(sys.argv[3]) + current_endpoint = sys.argv[4] + trainers = int(sys.argv[5]) + is_dist = True if sys.argv[6] == "TRUE" else False + + model = test_class() + if role == "pserver": + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + else: + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) class TestDistBase(unittest.TestCase): @@ -27,7 +130,7 @@ class TestDistBase(unittest.TestCase): self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._python_interp = "python" - def start_pserver(self, model_file): + def start_pserver(self, model_file, check_error_log): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, @@ -36,11 +139,23 @@ class TestDistBase(unittest.TestCase): (self._python_interp, model_file, self._ps_endpoints, ps1_ep, self._trainers) + ps0_pipe = subprocess.PIPE + ps1_pipe = subprocess.PIPE + if check_error_log: + print("ps0_cmd:", ps0_cmd) + print("ps1_cmd:", ps1_cmd) + ps0_pipe = open("/tmp/ps0_err.log", "wb") + ps1_pipe = open("/tmp/ps1_err.log", "wb") + ps0_proc = subprocess.Popen( - ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) ps1_proc = subprocess.Popen( - ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return ps0_proc, ps1_proc + ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + + if not check_error_log: + return ps0_proc, ps1_proc, None, None + else: + return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe def _wait_ps_ready(self, pid): retry_times = 50 @@ -57,7 +172,7 @@ class TestDistBase(unittest.TestCase): (e, retry_times)) retry_times -= 1 - def check_with_place(self, model_file, delta=1e-3): + def check_with_place(self, model_file, delta=1e-3, check_error_log=False): # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN required_envs = { "PATH": os.getenv("PATH"), @@ -66,17 +181,32 @@ class TestDistBase(unittest.TestCase): "FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_cudnn_deterministic": "1" } + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + # Run local to get a base line env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local.update(required_envs) local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ (self._python_interp, model_file, "127.0.0.1:1234", "127.0.0.1:1234", 1) - local_proc = subprocess.Popen( - local_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env_local) + if not check_error_log: + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env_local) + else: + print("trainer cmd:", local_cmd) + err_log = open("/tmp/trainer.err.log", "wb") + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=err_log, + env=env_local) + local_proc.wait() out, err = local_proc.communicate() local_ret = out @@ -84,7 +214,8 @@ class TestDistBase(unittest.TestCase): sys.stderr.write('local_stderr: %s\n' % err) # Run dist train to compare with local results - ps0, ps1 = self.start_pserver(model_file) + ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, + check_error_log) self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps1.pid) @@ -102,15 +233,23 @@ class TestDistBase(unittest.TestCase): env1.update(required_envs) FNULL = open(os.devnull, 'w') + tr0_pipe = subprocess.PIPE + tr1_pipe = subprocess.PIPE + if check_error_log: + print("tr0_cmd:", tr0_cmd) + print("tr1_cmd:", tr1_cmd) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") + tr0_proc = subprocess.Popen( tr0_cmd.split(" "), stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=tr0_pipe, env=env0) tr1_proc = subprocess.Popen( tr1_cmd.split(" "), stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=tr1_pipe, env=env1) tr0_proc.wait() @@ -127,12 +266,17 @@ class TestDistBase(unittest.TestCase): local_first_loss = eval(local_lines[0])[0] local_last_loss = eval(local_lines[1])[0] - self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) - self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + # close trainer file + if check_error_log: + tr0_pipe.close() + tr1_pipe.close() - # check tr0_out - # FIXME: ensure the server process is killed - # replace with ps0.terminate() + ps0_pipe.close() + ps1_pipe.close() + # FIXME: use terminate() instead of sigkill. os.kill(ps0.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL) FNULL.close() + + self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) + self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index a6fcbd977f1af9d452dfd8367efef706cd566149..b3ccec9a7d65de57778a1f013465d41a5a267676 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -11,200 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import numpy as np -import argparse -import time -import math - -import paddle -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -from paddle.fluid import core import unittest -from multiprocessing import Process -import os -import signal -from functools import reduce - -SEED = 1 -DTYPE = "float32" -paddle.dataset.mnist.fetch() - - -# random seed must set before configuring the network. -# fluid.default_startup_program().random_seed = SEED -def cnn_model(data): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=data, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - - # TODO(dzhwinter) : refine the initializer and random seed settting - SIZE = 10 - input_shape = conv_pool_2.shape - param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] - scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 - - predict = fluid.layers.fc( - input=conv_pool_2, - size=SIZE, - act="softmax", - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale))) - return predict - - -def get_model(batch_size): - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # Train program - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - - inference_program = fluid.default_main_program().clone() - # Optimization - opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) - - # Reader - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=batch_size) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=batch_size) - opt.minimize(avg_cost) - return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict - - -def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id=trainer_id, - program=main_program, - pservers=pserver_endpoints, - trainers=trainers) - return t - - -def run_pserver(pserver_endpoints, trainers, current_endpoint): - get_model(batch_size=20) - t = get_transpiler(0, - fluid.default_main_program(), pserver_endpoints, - trainers) - pserver_prog = t.get_pserver_program(current_endpoint) - startup_prog = t.get_startup_program(current_endpoint, pserver_prog) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - exe.run(pserver_prog) - - -class TestDistMnist(unittest.TestCase): - def setUp(self): - self._trainers = 1 - self._pservers = 1 - self._ps_endpoints = "127.0.0.1:9123" - - def start_pserver(self, endpoint): - p = Process( - target=run_pserver, - args=(self._ps_endpoints, self._trainers, endpoint)) - p.start() - return p.pid - - def _wait_ps_ready(self, pid): - retry_times = 5 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(1) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def stop_pserver(self, pid): - os.kill(pid, signal.SIGTERM) - - def test_with_place(self): - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - pserver_pid = self.start_pserver(self._ps_endpoints) - self._wait_ps_ready(pserver_pid) - - self.run_trainer(p, 0) - - self.stop_pserver(pserver_pid) - - def run_trainer(self, place, trainer_id): - test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( - batch_size=20) - t = get_transpiler(trainer_id, - fluid.default_main_program(), self._ps_endpoints, - self._trainers) - - trainer_prog = t.get_trainer_program() - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - feed_var_list = [ - var for var in trainer_prog.global_block().vars.values() - if var.is_data - ] +from test_dist_base import TestDistBase - feeder = fluid.DataFeeder(feed_var_list, place) - for pass_id in range(10): - for batch_id, data in enumerate(train_reader()): - exe.run(trainer_prog, feed=feeder.feed(data)) - if (batch_id + 1) % 10 == 0: - acc_set = [] - avg_loss_set = [] - for test_data in test_reader(): - acc_np, avg_loss_np = exe.run( - program=test_program, - feed=feeder.feed(test_data), - fetch_list=[batch_acc, avg_cost]) - acc_set.append(float(acc_np)) - avg_loss_set.append(float(avg_loss_np)) - # get test acc and loss - acc_val = np.array(acc_set).mean() - avg_loss_val = np.array(avg_loss_set).mean() - if float(acc_val - ) > 0.8: # Smaller value to increase CI speed - return - else: - print( - 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. - format(pass_id, batch_id + 1, - float(avg_loss_val), float(acc_val))) - if math.isnan(float(avg_loss_val)): - assert ("got Nan loss, training failed.") +class TestDistSeResneXt2x2(TestDistBase): + def test_se_resnext(self): + self.check_with_place("dist_mnist.py", delta=1e-7) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index f3a5fd6985bab1d04f6e1484534367548f383dfb..a33a338fc11e4301a8ec0eb565686d62b547b7f7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -17,7 +17,7 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): def test_se_resnext(self): - self.check_with_place("dist_se_resnext.py") + self.check_with_place("dist_se_resnext.py", delta=1e-7) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3bc5e6ada5dabfcad38278db6e14d0b115c5050d..0543e6238109bf5bab68a9f08fc34678936e756f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -259,7 +259,7 @@ class TestLRDecayConditional(TranspilerTest): serv_op = pserver.blocks[0].ops[0] sub_blocks = [] optimize_blocks = [] - for b in serv_op.attrs["optimize_blocks"]: + for b in serv_op.all_attrs()["optimize_blocks"]: optimize_blocks.append(b.idx) for b in pserver.blocks: if b.idx not in optimize_blocks: @@ -536,5 +536,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +class TestRMSPropOptimizer(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer.minimize(avg_cost) + return + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + pserver2, startup2 = self.get_pserver(self.pserver2_ep) + + self.assertEqual(len(pserver.blocks), 3) + # block1~2: optimize pass + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "rmsprop"]) + # the variable #fc_w will be split into two blocks + fc_w_var = startup.global_block().var("fc_w.block1") + self.assertEqual(fc_w_var.shape, (500, 1000)) + moment_var = startup.global_block().var("momentum_1") + self.assertEqual(moment_var.shape, (500, 1000)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 4bb3998f891959f8270dc4b25821f23f1e6195e0..543d0f9dc2c9b8cdcfaaaa14a7a4f197d210d951 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -11,192 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import numpy as np -import argparse -import time -import math -import paddle -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -from paddle.fluid import core import unittest -from multiprocessing import Process -import os -import signal - -IS_SPARSE = True -EMBED_SIZE = 32 -HIDDEN_SIZE = 256 -N = 5 -BATCH_SIZE = 32 -ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy - - -def get_model(): - def __network__(words): - embed_first = fluid.layers.embedding( - input=words[0], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_second = fluid.layers.embedding( - input=words[1], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_third = fluid.layers.embedding( - input=words[2], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_forth = fluid.layers.embedding( - input=words[3], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - - concat_embed = fluid.layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], axis=1) - hidden1 = fluid.layers.fc(input=concat_embed, - size=HIDDEN_SIZE, - act='sigmoid') - predict_word = fluid.layers.fc(input=hidden1, - size=dict_size, - act='softmax') - cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) - avg_cost = fluid.layers.mean(cost) - return avg_cost, predict_word - - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - - first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') - second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') - third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') - forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') - next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - avg_cost, predict_word = __network__( - [first_word, second_word, third_word, forth_word, next_word]) - - inference_program = paddle.fluid.default_main_program().clone() - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - train_reader = paddle.batch( - paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) - test_reader = paddle.batch( - paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) - - return inference_program, avg_cost, train_reader, test_reader, predict_word - - -def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id=trainer_id, - program=main_program, - pservers=pserver_endpoints, - trainers=trainers) - return t - - -def run_pserver(pserver_endpoints, trainers, current_endpoint): - get_model() - t = get_transpiler(0, - fluid.default_main_program(), pserver_endpoints, - trainers) - pserver_prog = t.get_pserver_program(current_endpoint) - startup_prog = t.get_startup_program(current_endpoint, pserver_prog) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - exe.run(pserver_prog) - - -class TestDistMnist(unittest.TestCase): - def setUp(self): - self._trainers = 1 - self._pservers = 1 - self._ps_endpoints = "127.0.0.1:9123" - - def start_pserver(self, endpoint): - p = Process( - target=run_pserver, - args=(self._ps_endpoints, self._trainers, endpoint)) - p.start() - return p.pid - - def _wait_ps_ready(self, pid): - retry_times = 5 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(1) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def stop_pserver(self, pid): - os.kill(pid, signal.SIGKILL) - - def test_with_place(self): - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - pserver_pid = self.start_pserver(self._ps_endpoints) - self._wait_ps_ready(pserver_pid) - - self.run_trainer(p, 0) - - self.stop_pserver(pserver_pid) - - def run_trainer(self, place, trainer_id): - test_program, avg_cost, train_reader, test_reader, predict = get_model() - t = get_transpiler(trainer_id, - fluid.default_main_program(), self._ps_endpoints, - self._trainers) - - trainer_prog = t.get_trainer_program() - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - use_gpu = True if core.is_compiled_with_cuda() else False - - exec_strategy = ExecutionStrategy() - exec_strategy.use_cuda = use_gpu - train_exe = fluid.ParallelExecutor( - use_cuda=use_gpu, - main_program=trainer_prog, - loss_name=avg_cost.name, - exec_strategy=exec_strategy) +from test_dist_base import TestDistBase - feed_var_list = [ - var for var in trainer_prog.global_block().vars.values() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) - for pass_id in range(10): - for batch_id, data in enumerate(train_reader()): - avg_loss_np = train_exe.run(feed=feeder.feed(data), - fetch_list=[avg_cost.name]) - loss = np.array(avg_loss_np).mean() - if float(loss) < 5.0: - return - if math.isnan(loss): - assert ("Got Nan loss, training failed") +class TestDistSeResneXt2x2(TestDistBase): + def test_se_resnext(self): + self.check_with_place("dist_word2vec.py", delta=1e-7) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py index 46c3bbb6712c6276e48dd9328d7741a447f28b91..e6f37f0b4ca781e4ec83a00f8f2605ef02716bd7 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_var.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py @@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest): layers.assign(input=val, output=x) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_main_program(), feed={}, fetch_list=[]) - fetched_x = fluid.fetch_var("x") + fetched_x = fluid.executor._fetch_var("x") self.assertTrue( numpy.array_equal(fetched_x, val), "fetch_x=%s val=%s" % (fetched_x, val)) diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index c51a48239330621d8e008415f81361616467cabf..0997afc97a97333c914a3027103ec48733b410dc 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -17,6 +17,7 @@ import unittest from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name import paddle.fluid.layers as layers +import paddle.fluid as fluid main_program = default_main_program() @@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase): new_program = main_program.clone() self.assertNotEqual(0, len(new_program.blocks[0].all_parameters())) + def test_program_inference_optimize(self): + def net(): + reader = fluid.layers.py_reader( + capacity=10, + shapes=[[-1, 10], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + use_double_buffer=True) + in_data, label = fluid.layers.read_file(reader) + predict_label = fluid.layers.fc(in_data, size=2, act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict_label, label=label)) + + optimizer = fluid.optimizer.Adam() + optimizer.minimize(loss) + + startup_program = fluid.Program() + main_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + net() + no_read_program = main_program.inference_optimize() + keep_read_program = main_program.inference_optimize( + export_for_deployment=False) + no_read_ops = no_read_program.global_block().ops + keep_read_ops = keep_read_program.global_block().ops + self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2) + self.assertEqual(keep_read_ops[0].type, 'create_double_buffer_reader') + self.assertEqual(keep_read_ops[1].type, 'read') + + for i in range(len(no_read_ops)): + self.assertEqual(no_read_ops[i].type, keep_read_ops[i + 2].type) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index 621dd681345d2c185c0644de917b8bcd529f9937..9853fb4e9a89944bfdf2954e3d3d86fef92ac93c 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase): self.assertEqual(8, len(op.attr_names())) op.set_block_attr("block_attr", program_desc.block(0)) - self.assertEqual(0, op.block_attr("block_attr")) + self.assertEqual(0, op.block_attr_id("block_attr")) mul_op = block.append_op() mul_op.set_type("mul") diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py index 91b1fd2af7d8aaf85d17965f8b02c35ee3990291..f9bda5e4701f693f41fe7041ba0f5ec80b6fc31c 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py @@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase): next_data = np.random.uniform( low=0, high=1000, size=(batch_size, ) + shape[1:]).astype(dtype) - in_data.append(executor.as_lodtensor(next_data)) + in_data.append( + fluid.executor._as_lodtensor(next_data, place)) self.inputs.append(in_data) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 17ab875f6a555c99bbe480a54ea2df3b1f60de2d..868a0248be6833d0e8fed8a26549352562c279c1 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -22,7 +22,7 @@ pos_enc_param_names = ( "src_pos_enc_table", "trg_pos_enc_table", ) -batch_size = 64 +batch_size = 2 def position_encoding_init(n_position, d_pos_vec): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 8832870e871fc83f7594698fb82e1079ca6a22de..9cc57eae9bed4c138cfadaef20c6c1b9039367bd 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -596,12 +596,12 @@ class DistributeTranspiler(object): if op.type in [ "gaussian_random", "fill_constant", "uniform_random" ]: - op.attrs["shape"] = new_outputs["Out"].shape + op.set_attr("shape", list(new_outputs["Out"].shape)) s_prog.global_block().append_op( type=op.type, inputs=new_inputs, outputs=new_outputs, - attrs=op.attrs) + attrs=op.all_attrs()) # add slice vars slice_vars_and_atts = self._get_slice_vars_and_atts(endpoint) @@ -642,7 +642,7 @@ class DistributeTranspiler(object): self.table_name = None for op in self.origin_program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: - if op.attrs['is_distributed'] is True: + if op.attr('is_distributed') is True: if self.table_name is None: self.table_name = op.input("W")[0] if self.table_name != op.input("W")[0]: @@ -788,14 +788,14 @@ class DistributeTranspiler(object): out_name = op.output("Out") ids_var = program.global_block().vars[ids_name[0]] - prefetch_input_vars = self.create_splited_vars( + prefetch_input_vars = self._create_splited_vars( source_var=ids_var, block=program.global_block(), tag="_prefetch_in_") self.all_prefetch_input_vars.append(prefetch_input_vars) out_var = program.global_block().vars[out_name[0]] - prefetch_output_vars = self.create_splited_vars( + prefetch_output_vars = self._create_splited_vars( source_var=out_var, block=program.global_block(), tag="_prefetch_out_") @@ -1077,7 +1077,7 @@ class DistributeTranspiler(object): program.global_block()._sync_with_cpp() return var_mapping - def create_splited_vars(self, source_var, block, tag): + def _create_splited_vars(self, source_var, block, tag): return [ block.create_var( name=str(source_var.name + tag + str(index)), @@ -1221,18 +1221,39 @@ class DistributeTranspiler(object): program = optimize_block.program pserver_block = program.global_block() new_inputs = dict() + # update param/grad shape first, then other inputs like # moment can use the updated shape + def _get_param_block(opt_op): + # param is already created on global program + param_block = None + for p in self.param_grad_ep_mapping[endpoint]["params"]: + if same_or_split_var(p.name, opt_op.input("Param")[0]): + param_block = p + break + return param_block + for key in opt_op.input_names: if key == "Grad": new_inputs[key] = merged_var + # For RMSProp optimizer + elif key == "Moment" or key == "MeanSquare": + param_block = _get_param_block(opt_op) + if not param_block: + return + moment_var = origin_program.global_block().vars[opt_op.input( + key)[0]] + tmpvar = pserver_block.create_var( + name=moment_var.name, + persistable=moment_var.persistable, + dtype=moment_var.dtype, + # change to use same shape as param + # TODO(typhoonzero): didn't append .block in the var name, + # may affect checkpoint saving? Need to verify. + shape=param_block.shape) + new_inputs[key] = tmpvar elif key == "Param": - # param is already created on global program - param_block = None - for p in self.param_grad_ep_mapping[endpoint]["params"]: - if same_or_split_var(p.name, opt_op.input(key)[0]): - param_block = p - break + param_block = _get_param_block(opt_op) if not param_block: return tmpvar = pserver_block.create_var( @@ -1258,7 +1279,7 @@ class DistributeTranspiler(object): for key in opt_op.input_names: new_shape = None - if key in ["Param", "Grad", "LearningRate"]: + if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]: continue var = self.origin_program.global_block().vars[opt_op.input(key)[0]] # update accumulator variable shape @@ -1281,7 +1302,7 @@ class DistributeTranspiler(object): type=opt_op.type, inputs=new_inputs, outputs=outputs, - attrs=opt_op.attrs) + attrs=opt_op.all_attrs()) def _is_splited_grad_var(self, var, var_dict): grad_block = None @@ -1314,7 +1335,7 @@ class DistributeTranspiler(object): block._clone_variable(var) return block.append_op( - type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) + type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs()) def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program @@ -1355,7 +1376,7 @@ class DistributeTranspiler(object): type=opt_op.type, inputs=inputs, outputs=outputs, - attrs=opt_op.attrs) + attrs=opt_op.all_attrs()) def _is_op_connected(self, op1, op2): # If one op's input is another op's output or @@ -1460,8 +1481,8 @@ class DistributeTranspiler(object): # optimize op_maker = core.op_proto_and_checker_maker optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize - if op_maker.kOpRoleAttrName() in op.attrs and \ - int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): + if op_maker.kOpRoleAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): return True return False @@ -1484,8 +1505,8 @@ class DistributeTranspiler(object): # and op_role_var to get the pair. for input_name in op.input_arg_names: if input_name.find("@GRAD") != -1 and \ - op.attrs[RPC_OP_ROLE_ATTR_NAME]: - param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0] + op.attr(RPC_OP_ROLE_ATTR_NAME): + param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0] params_grads.append([ origin_var_dict[param_name], origin_var_dict[input_name] diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 142fa5c31d2c558e482001da73fa26d8396c3967..87f20bbccf3138585841952efacef5b0a3cbbace 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,10 +57,10 @@ class InferenceTranspiler(object): scope = global_scope() if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") - self.fuse_batch_norm(program, place, scope) - self.fuse_relu_mkldnn(program) + self._fuse_batch_norm(program, place, scope) + self._fuse_relu_mkldnn(program) - def fuse_relu_mkldnn(self, program): + def _fuse_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. @@ -104,7 +104,7 @@ class InferenceTranspiler(object): # And a better solution will be considered later. program = program.clone() - def fuse_batch_norm(self, program, place, scope): + def _fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 1ec210f265049c8b62cd99cd218f25a9f846ef43..b9e602f324ad9bf43416b420c6d5697050a5c802 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -15,7 +15,7 @@ WMT14 dataset. The original WMT14 dataset is too large and a small set of data for set is provided. This module will download dataset from -http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz and parse training set and test set into paddle reader creators. """ @@ -37,8 +37,7 @@ URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' # this is a small set of data for test. The original data is too large and # will be add later. -URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' - 'wmt_shrinked_data/wmt14.tgz') +URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz') MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' # BLEU of this trained model is 26.92 URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz' diff --git a/python/requirements.txt b/python/requirements.txt index c091ecb111bda9d5e83c3ddcae93aed0745f9e4c..f8298a63612cb217ce0e711e78fffdf86b73313d 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ requests==2.9.2 -numpy>=1.12 +numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version protobuf==3.1 recordio>=0.1.0 matplotlib