diff --git a/Dockerfile b/Dockerfile index 84e1edbee91b0c3877ec7c55161af8c3bdf1c0dc..716b164ab84c1b1160b0fad29849d5d04e5174e1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 install -U wheel && \ - pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 install -U wheel && \ - pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 --no-cache-dir install -U wheel && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 install opencv-python && \ - pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python + pip --no-cache-dir install -U pip setuptools wheel && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python #For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip3.6 install pylint pytest astroid isort -RUN pip3.7 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker +RUN pip3 --no-cache-dir install pylint pytest astroid isort +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort +RUN pip3.7 --no-cache-dir install pylint pytest astroid isort +RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip3.6 install -r /root/requirements.txt -RUN pip3.7 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt +RUN pip3 --no-cache-dir install -r /root/requirements.txt +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt +RUN pip3.7 --no-cache-dir install -r /root/requirements.txt +RUN pip --no-cache-dir install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip3.6 install certifi urllib3[secure] -RUN pip3.7 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y +RUN pip3 --no-cache-dir install certifi urllib3[secure] +RUN pip3.6 --no-cache-dir install certifi urllib3[secure] +RUN pip3.7 --no-cache-dir install certifi urllib3[secure] +RUN pip --no-cache-dir install certifi urllib3[secure] # Install woboq_codebrowser to /woboq diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c29375cd0589764507f9a10859a6d4fdbb29716e..a9b99e9ab87c724ac7062e3a20b247bf6ea44634 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -106,10 +106,10 @@ else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) + DEPENDS mkldnn shared_mkldnn) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) - +ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) IF(WITH_C_API) INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) ENDIF() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 48279bc809dde9e97c967c3ea5d03fbd7b89b017..3e11d332ff71098adf65e487a39351ae57427e9e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -136,7 +136,7 @@ if (WITH_MKLDNN) copy(mkldnn_lib SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn + DEPS mkldnn_shared_lib ) endif () diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4feaace040da416a01872882456c4098fc..566dc75fda019eb66759eb403f60e16f18cffef1 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 8d95dc0591e1d6bd815cc697528191c2ee8c1cfe..500f64bed9898fa874cbad2ea69aae05df58023e 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -60,7 +60,7 @@ class Float16Transpiler: raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") self.scope = scope diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b6974c6af290438f827c16bb478eb43e3cf42247..9872631553056fbabd8be8162ecee16250b33862 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) -paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope -paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 9b09a03f2b9e4a9927c9d32523afefe19d6e8e93..f9c90e05c3b5e0d3f4ac6d5a62b3efb4509d0259 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,27 +7,17 @@ function(windows_symbolic TARGET) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - -#only copy the xx.cu to.xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() @@ -78,17 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) -cc_test(variable_test SRCS variable_test.cc) - cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits dynload_cuda) +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits) +cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function) + DEPS operator op_registry device_context math_function scope) if(WITH_GPU) if (WIN32) diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index c9ec5e7a7b37b62efbf3d980e93b5518364d99c9..96a2f9250ff928fe58a5339a25c68c9db515522d 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index abacb11e3b018308c20a67630e3ff34cca7d3387..03fbfd7f24a8a987db72f45be777acc7ece577a6 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() { } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 949510e03705a4a0900f1c7b8758a8f7308aa44b..872bc5d654cd66db821e56031d878815b653645c 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 2cbfa24951a3d2f07e3c2f41b3f18b2d882b0683..211668b871fca0d14e2bf34101fc98bc8bb6a7a0 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -42,6 +42,12 @@ namespace { typedef std::vector GraphOps; const char kGraphOps[] = "ops"; +bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) { + return boost::get( + node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(role); +} + void PolishGraphToSupportDataHazards(ir::Graph *graph) { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { @@ -147,6 +153,7 @@ void MultiDevSSAGraphBuilder::Init() const { #endif balance_vars_.resize(places_.size(), 0); + if (strategy_.enable_data_balance_ && places_.size() == 1) { LOG(WARNING) << "It is no need to enable data balance when there is only " "one place. enable_data_balance is set to False."; @@ -154,145 +161,16 @@ void MultiDevSSAGraphBuilder::Init() const { } } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (ir::Node *input : node->inputs) { - VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); - op_handle->AddInput(var); - } - - for (ir::Node *output : node->outputs) { - ir::Node *new_node = nullptr; - if (output->Var()) { - new_node = result->CreateVarNode(output->Var()); - } else { - new_node = - result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); - } - CreateOpOutput(result, op_handle, new_node, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector &nodes) const { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector &nodes) const { - std::vector recv_vars; - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -// Topology sort the graph nodes from inputs to outputs. -// Since SSAGraphBuilder depends on forward/backward nodes to assign devices -// to parameter/gradients before optimizer ops, topo sort is insufficient. ( -// some optimizer ops might not depend on any nodes), we manually move all -// optimizer nodes after last backward nodes. -// However, the assumption by SSAGraphBuilder should be relaxed in the future. -std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); - size_t last_backward = 0; - for (size_t i = 0; i < ret.size(); ++i) { - if (boost::get( - ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = i; - } - } - - std::vector optimize_ops; - std::vector sorted_ret; - for (size_t i = 0; i < ret.size(); ++i) { - if (i < last_backward) { - if (static_cast(boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kOptimize))) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); - } - } else if (i == last_backward) { - sorted_ret.push_back(ret[i]); - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (ir::Node *n : sorted_ret) { - for (ir::Node *in : n->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), n->Name()); - } - } - } - sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), - optimize_ops.end()); - } else { - sorted_ret.push_back(ret[i]); - } - } - return sorted_ret; -} - std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { Init(); // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + std::vector sorted_ops = ir::TopologySortOperations(*graph); + + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + sorted_ops = SortForReduceMode(sorted_ops); + } + auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; @@ -303,31 +181,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( all_vars_.emplace(node->Name(), node->Var()); } } - std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - // find send/recv vars so that we can place the distributed training - // related op in the place 0 - auto send_vars = FindDistTrainSendVars(sorted_ops); - auto recv_vars = FindDistTrainRecvVars(sorted_ops); - std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); - size_t cur_device_id = 0; bool is_forwarding = true; bool is_dist_train = false; std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - if (boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { + if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -341,9 +210,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } is_dist_train = true; - } else if (boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kDist)) { + } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); if (node->Op()->Type() == "concat") { auto origin_param_name = node->Op()->OutputArgumentNames()[0]; @@ -365,7 +232,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + int op_dev_id = GetOpDeviceID(node, sharded_var_device); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -384,48 +251,49 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && num_parallel_devices > 1) { + if (!is_forwarding && num_parallel_devices > 1UL) { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } + try { + auto backward_vars = boost::get>( + node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + size_t cur_device_id = -1; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + sharded_var_device.emplace(g_name, cur_device_id); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; } - } catch (boost::bad_get e) { } + } catch (boost::bad_get e) { } } } @@ -469,12 +337,108 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +std::vector MultiDevSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::unordered_map sharded_var_device; + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); + } + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. + } } - return false; + + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); + return sorted_ops; +} + +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); + } +} + +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; } void MultiDevSSAGraphBuilder::SetCommunicationContext( @@ -625,28 +589,52 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( } int MultiDevSSAGraphBuilder::GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, + const std::unordered_map &sharded_var_device, + std::unordered_map> *delay_ops) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +int MultiDevSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, const std::unordered_map &sharded_var_device) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int op_role = boost::get( - node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - if (op_role != static_cast(framework::OpRole::kOptimize)) { + + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { return -1; } auto param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); if (got == sharded_var_device.end()) { @@ -741,8 +729,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( node->Op()->Type() == "split_selected_rows" || node->Op()->Type() == "split_ids") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { @@ -753,8 +740,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( sharded_var_device->emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); for (auto &varname : output_var_names) { sharded_var_device->emplace(varname, op_dev_id); } @@ -795,8 +781,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -826,8 +811,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = - GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -862,8 +846,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = - GetVarDeviceID(*result, output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -880,6 +863,14 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; + } + return false; +} + bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { return boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 5736102ddc13418446013307cf8204b677f960dc..7029e9dc18cbacf0c5f0d7c6430d84fb72d6a0a3 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { #endif int GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const; bool IsScaleLossOp(ir::Node *node) const; @@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ir::Graph *result, ir::Node *node, std::unordered_map *sharded_var_device) const; - std::vector FindDistTrainSendVars( - const std::vector &nodes) const; - - std::vector FindDistTrainRecvVars( - const std::vector &nodes) const; - void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; @@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { int dev_id) const; int GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, const std::unordered_map &sharded_var_device) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; + std::vector SortForReduceMode( + const std::vector &) const; + + int GetOpDeviceID( + ir::Node *node, + const std::unordered_map &shared_var_device, + std::unordered_map> *delay_ops) + const; + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 3dfd14419d94379a0bf79f55d7a139acd77cbd7e..134f759081a0778194c20785e215420d6e2bb622 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", var->Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); } } @@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", var.Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index da9556c6c1f3468208db02f2958ad6ad137c6566..594fbb48a6d12715fa42494e30bc8f50fbe171ef 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -119,7 +119,7 @@ static void DeleteUnusedTensors( } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b7f7e2ee8ef590c0d0d8307de4400a8ce8ad4e7d..6d795e1e2d5407ecacf5fb4af539919d72bff404 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -45,6 +45,7 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) +pass_library(conv_affine_channel_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7bfb8cf1ee09e78051e2f140c9a7ab4c40db60c --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; +} + +std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+affinechannel fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get affine_channel bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_ac_count); + return graph; +} + +std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); + + GraphSafeRemoveNodes(graph.get(), + {ac_scale, ac_bias, affine_channel, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, ac_out); + + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + AddStatis(found_conv_ac_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); +REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, + paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..ad966e11e6222a4ed4c730089c454b0d1c7bd0b3 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class ConvAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_affine_channel_fuse"}; +}; + +class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 23f343f631628b432d91d4504019895ed4bac4a5..c6121777e8d2c32193b5c170bb0fa3f0337c9bc3 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( const std::string& output) { auto proto = base_desc; framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); desc.SetInput("Bias", {bias}); desc.SetInput("ResidualData", {bias1}); desc.SetAttr("activation", activation); desc.SetOutput("Output", {output}); desc.SetAttr("is_test", true); - + desc.SetAttr("use_cudnn", false); + desc.Flush(); return *desc.Proto(); } std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; FusePassBase::Init(pattern_name, graph.get()); GraphPatternDetector gpd; @@ -76,22 +78,23 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( framework::OpDesc new_op_desc(new_op_proto, nullptr); // Create a new node for the fused op. - graph->CreateOpNode(&new_op_desc); + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. PADDLE_ENFORCE(subgraph.count(x)); auto* conv_in_node = subgraph.at(x); - IR_NODE_LINK_TO(conv_in_node, conv_op); // Input - IR_NODE_LINK_TO(conv_filter, conv_op); // Filter - IR_NODE_LINK_TO(conv_op, conv_out); // Output - IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), - {conv_op, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out}); + GraphSafeRemoveNodes( + graph.get(), + {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; gpd(graph.get(), handler); return graph; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8670dcfed7e40473e06cd12cecc1157dd4f54aa0..3eb5bdba3b7275f45cdfc6ad47f75e7a423541d0 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,66 +23,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -namespace { - -void CheckProgram(const ProgramDesc &program) { -#define _INT(role) static_cast(role) - - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. Don't add new role because " - "you don't know what you are doing."; - } - } - -#undef _INT -} -} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { - CheckProgram(program_); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e5167c039ec8d9e4300b190a726bb02a63..c513fe2dd8f5733c87802f6fa9980ad885dfd865 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -std::unordered_set conv_act_set({"identity", "sigmoid", "relu", - "relu6", "relux", "tanh", - "band_pass"}); +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ->AsInput(); auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) ->assert_is_op_output("elementwise_add") - ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_input("elementwise_add", "Y") ->AsIntermediate(); auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) ->assert_is_op("elementwise_add"); auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) - ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_op_input("elementwise_add", "X") ->AsInput(); auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) ->assert_is_op_output("elementwise_add") @@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) .LinksTo({elementwise_add_out}); - elementwise_add_op_1->LinksFrom( - {elementwise_add_out, elementwise_add_in_y_1}); + elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1}) + .LinksTo({elementwise_add_out_1}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); return act_out; } @@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { return elementwise_add_out; } +PDNode *patterns::ConvAffineChannel::operator()( + paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + + auto *affine_channel_op = + pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as AffineChannel input + conv_out_var->assert_is_op_input("affine_channel", "X"); + } + + // AC Scale + auto *ac_scale_var = pattern->NewNode(ac_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Scale"); + // AC Bias + auto *ac_bias_var = pattern->NewNode(ac_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Bias"); + + // AC output + auto *ac_out_var = pattern->NewNode(ac_out_repr()) + ->AsOutput() + ->assert_is_op_output("affine_channel"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } else { + affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } + return ac_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index eaedd9d08e0fab820481d6eaacb6e7bfc1ab6d1d..61a53003449710da2a52c90197c9f2f3ac56c7bb 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_out); }; +// Conv with affine_channel +// op: conv + (elementwise_add +) affine_channel +// named nodes: +// conv_weight, conv_out, conv, +// ac_x, ac_scale, ac_bias +// affine_channel, ac_out +struct ConvAffineChannel : public PatternBase { + ConvAffineChannel(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_affine_channel") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(affine_channel); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + + // AC(Affine_Channel) inputs + PATTERN_DECL_NODE(ac_scale); + PATTERN_DECL_NODE(ac_bias); + // AC outputs + PATTERN_DECL_NODE(ac_out); // Out +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f9663bbb734d5a6eb78135aecbc3a3b..c3a044d22cf04dceecc164fae934ee15c4563af1 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2311614c335a56501ac777d787f6653659294765..ca31303f77c4a30eb64c43404e214779ea78aeaf 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") - .SetDefault({}); - Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..4c59c73d8779eceb267ad532aabccabbd54b0df2 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } - static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c524b4725dfebd6ef07594b0b45c65463..2c1648c81fc999c6306d5b08bc243f3ad21fec04 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. */ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac2828136b45159b84b2e00e5d2310e7946976f4..f659a7b93fe06341b3c94aa07c526c982ff54483 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,15 +16,10 @@ limitations under the License. */ #include #include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" + #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -162,67 +157,31 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - try { - if (VLOG_IS_ON(4)) { - VLOG(4) << place << " " << DebugStringEx(&scope); - } - if (platform::is_gpu_place(place)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } - - // The profile has a process-wide mutex, results in serious performance - // issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. - if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); - } - - if (VLOG_IS_ON(3)) { - VLOG(3) << place << " " << DebugStringEx(&scope); - } - } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw exception; - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + } - if (callstack.empty()) { - throw exception; - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); - throw exception; - } catch (...) { - std::rethrow_exception(std::current_exception()); + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); } + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { @@ -421,7 +380,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var.Type().name()); + ToTypeName(var.Type())); } } @@ -432,7 +391,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -526,7 +485,7 @@ const std::vector ExecutionContext::MultiInput( PADDLE_ENFORCE( var->IsType(), "should be LoDTensor, but the received type is %s", - var->Type().name()); + ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -545,7 +504,7 @@ const std::vector ExecutionContext::LegacyMultiInput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -574,7 +533,7 @@ std::vector ExecutionContext::MultiOutput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return var->GetMutable(); }); return res; @@ -816,7 +775,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " "type_id is %s.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -839,7 +798,7 @@ class RuntimeInferShapeContext : public InferShapeContext { var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1fe2daacf1369902cde732422b4e65c3d156250f..bad9716e8b40916dabb5914277c3663532a3e964 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -110,8 +116,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 45e34cab4c00695ee67d176487d2030ac8928cff..fd566be44c72bd7e5874b677fa9db11f467050d5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -367,6 +367,7 @@ void ParallelExecutor::BCastParamsToDevices( if (paddle::platform::is_gpu_place(main_tensor.place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; + buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -400,9 +401,7 @@ void ParallelExecutor::BCastParamsToDevices( #endif } else { platform::CPUPlace cpu; - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (i == 0) continue; - + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 6fa5e99f9f3a7e871f1a742a30803853988ea6eb..750b626603178d2d2360c74b7b6530fa7cfe47b0 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -165,11 +165,9 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::VarInternal(const std::string& name) { auto* v = FindVarLocally(name); if (v != nullptr) return v; - v = new Variable(); - vars_[name].reset(v); + vars_.emplace(name, std::unique_ptr(v)); VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cb241a7a341d793d8450f0c9cde3929acef8965 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace framework { + +ScopePool &ScopePool::Instance() { // NOLINT + static ScopePool pool; + return pool; +} + +void ScopePool::DeleteScope(Scope *scope) { delete scope; } + +void ScopePool::Insert(std::unique_ptr &&s) { + std::lock_guard guard(mtx_); + scopes_.insert(s.release()); +} + +void ScopePool::Remove(Scope *s) { + size_t has_scope; + { + std::lock_guard guard(mtx_); + has_scope = scopes_.erase(s); + } + PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope"); + DeleteScope(s); +} + +ScopePool::~ScopePool() { Clear(); } + +void ScopePool::Clear() { + std::lock_guard guard(mtx_); + for (auto *s : scopes_) { + DeleteScope(s); + } + scopes_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..a8b468699abe148d44a395cf888158aefab4380b --- /dev/null +++ b/paddle/fluid/framework/scope_pool.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +class ScopePool { + public: + static ScopePool &Instance(); // NOLINT + + void Insert(std::unique_ptr &&s); + + void Remove(Scope *s); + + void Clear(); + + ~ScopePool(); + + private: + ScopePool() = default; + + static void DeleteScope(Scope *scope); + + std::unordered_set scopes_; + std::mutex mtx_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 3b6f1cdb8f24ab20bfc80eeeba88891d7b41d1f9..73be446f71f193bea203c986b482e6b98a9826c5 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -19,52 +19,50 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { template -inline bool IsType(const std::type_index& type_index) { - return type_index == std::type_index(typeid(T)); +inline bool IsType(const std::type_index& type) { + return type == typeid(T); } -inline proto::VarType::Type ToVarType(std::type_index type) { - if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_RANK_TABLE; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (IsType(type)) { - return proto::VarType_Type_SELECTED_ROWS; - } else if (IsType(type)) { - return proto::VarType_Type_READER; - } else { - PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); } } template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { - switch (ToVarType(var.Type())) { - case proto::VarType_Type_LOD_TENSOR: + switch (var.Type()) { + case proto::VarType::LOD_TENSOR: visitor(var.Get()); return; - case proto::VarType_Type_LOD_RANK_TABLE: + case proto::VarType::LOD_RANK_TABLE: visitor(var.Get()); return; - case proto::VarType_Type_LOD_TENSOR_ARRAY: + case proto::VarType::LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case proto::VarType_Type_SELECTED_ROWS: + case proto::VarType::SELECTED_ROWS: visitor(var.Get()); return; - case proto::VarType_Type_READER: + case proto::VarType::READER: visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 7842168f603885ce7dc87d2a01dfa4f544389faa..2a75394fca719196a9d53894b080598e942baa45 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, + ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3c5bab23b92a0274cf786ea2f18d8246706162f --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +// Besides registering variable type id, it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { + using Type = + typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); + constexpr int kId = VarTypeTrait::kId; + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); + VarIdToTypeIndexMapInitializerImpl::Init(id_to_type, + type_to_id); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *, MapType2 *) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map and std::type_index -> var_id map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + + public: + static const std::type_index &ToTypeIndex(int var_id) { + auto it = Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; + } + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; +}; + +} // namespace detail + +const std::type_index &ToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +int ToTypeId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..cc68cf2ab8e1bbc8a57cf97a2084610440a75f85 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,195 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include +#ifndef _WIN32 +#include +#endif +#endif + +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { +template +class AlgorithmsCache; + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + +namespace paddle { +namespace framework { + +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); +int ToTypeId(const std::type_index &type); + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... +// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args..., return -1 + template + static constexpr int TypePos() { + return TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. +using VarTypeRegistry = detail::VarTypeRegistryImpl< + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, +#endif + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::CudnnRNNCache, +#endif + int, float>; + +template +struct VarTypeTrait { + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); + using Type = T; + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. + * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). + */ + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto below +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); + +/** End of variable type registration */ + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..00840d634d802cfe17fbff127a75606cb5e2cf79 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { + using Type = + typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_EQ(ToTypeId(actual_type), kId); + EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); + EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); + TypeIndexChecker::Check(var_id_set, + type_index_set); + } +}; + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *, SetType2 *) {} +}; + +TEST(var_type_traits, check_no_duplicate_registry) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); +} + +TEST(var_type_traits, test_registry) { + using Registry = detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a584df3ba90cf5c1a62a3879bf98ce5c..b9d07da822cf1eb42859e1d7d84437582fada8ff 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { @@ -27,10 +27,14 @@ class Variable { public: template const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); return *static_cast(holder_->Ptr()); } @@ -39,61 +43,61 @@ class Variable { template T* GetMutable() { if (!holder_) { - holder_.reset(new PlaceholderImpl(new T())); + holder_.reset(new PlaceholderImpl()); } else { - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); } return static_cast(holder_->Ptr()); } template bool IsType() const { - return holder_ != nullptr && - std::type_index(typeid(T)) == std::type_index(holder_->Type()); + return holder_ && holder_->Type() == VarTypeTrait::kId; } void Clear() { holder_.reset(); } - std::type_index Type() const { + int Type() const { PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); return holder_->Type(); } private: struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_info& Type() const = 0; - virtual void* Ptr() const = 0; + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; }; // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. template struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} - - virtual const std::type_info& Type() const { return type_; } - virtual void* Ptr() const { return static_cast(ptr_.get()); } + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } - std::unique_ptr ptr_; - const std::type_info& type_; + private: + T obj_; }; - std::unique_ptr - holder_; // pointers to a PlaceholderImpl object indeed. - - // name_ is only meaningful with a Scope and accessible by it. - // - // NOTE: Please don't expose name_ by adding methods like - // Variable::Name or Scope::VarName! A variable could have a human - // readable name or an auto-generated scope-unique name. In the - // former case, the caller knows the name and doesn't need to access - // the name; in the latter case, the variable should be identified - // by its address but not the unreadable name. - friend class Scope; - const std::string* name_; + // pointers to a PlaceholderImpl object indeed. + std::unique_ptr holder_; }; } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3dfe5ecfd563a686bb72b061aff602f73..511c9c52146ece4b90905cc9d49565103589c1ec 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -16,27 +16,28 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -TEST(Variable, GetMutable) { - using paddle::framework::Variable; - - struct Tensor { - int content_; - }; +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { std::unique_ptr v(new Variable()); - Tensor* t = v->GetMutable(); - t->content_ = 1234; + auto* t = v->GetMutable(); + *t = "1234"; - const Tensor& tt = v->Get(); - EXPECT_EQ(1234, tt.content_); + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } EXPECT_TRUE(false); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cb88333d1570322fbac7112755bab5e11c97201a..f84e1ab6b827b3b96d0a503394d95b06ed25a3d2 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -69,17 +69,17 @@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); i++) { - LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], result[i]); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 83d411eecf6d706615243fd78cb7e4330d904fc1..2db5705d0944b2ab10defdda9a7b616daa8fd47e 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -127,6 +127,7 @@ struct Argument { std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); // The program transformed by IR analysis phase. DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 51bca8039d4531536cd7a3c39ef8a27f1a5412a1..b8c9426ed3b62d35f78247269cb32d2f6344b092 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + pass->Set("min_subgraph_size", + new int(argument->tensorrt_min_subgraph_size())); } // graph_ = pass->Apply(std::move(graph_)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 9c42b83e7add348433635b1899087324e4e370d4..ad10010e42be9717e3298fc88c89764e4ae2690b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" namespace paddle { namespace inference { @@ -36,7 +38,8 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( auto teller = Get("tensorrt_node_teller"); - SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + SubGraphFuser fuser(graph.get(), teller, + Get("min_subgraph_size") /*min subgraph size*/); fuser(); for (auto *node : graph->Nodes()) { @@ -197,10 +200,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, std::vector ExtractParameters( const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its presistable property, but sometimes the presistable + // of the feed op output is true, so we have to identify it. + std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + std::vector parameters; for (const auto &node : nodes) { if (!node->IsVar()) continue; - if (node->Var()->Persistable()) { + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { parameters.push_back(node->Name()); } } @@ -215,4 +234,5 @@ REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") - .RequirePassAttr("workspace_size"); + .RequirePassAttr("workspace_size") + .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dcefdd92f5157dce7426f2f3e4a2bc053ce24775..6d6e799fdec9c67b4714f203b91b8bccb61510ba 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -57,6 +57,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; if (use_gpu) { @@ -89,6 +90,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; pass_builder_ = std::move(other.pass_builder_); @@ -105,12 +107,14 @@ void contrib::AnalysisConfig::EnableMKLDNN() { } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size) { + int max_batch_size, + int min_subgraph_size) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - // Append after the infer_clean pass. - pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + tensorrt_min_subgraph_size_ = min_subgraph_size; + // Append after the conv+affine_channel fuse pass. + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3937884ce4a5a16a1093ac8977033eaa98b2678e..3f8feaaa1e9f91c2ea342ba9227305ad6eb34033 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -328,6 +328,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); + argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); } if (config_.use_mkldnn_) { diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 569a487328e2f1febe2ca5014b232dbd51d28079..03c2aa3fb8094ce2996f513b90589de0ef903ae8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // TODO(Superjomn) should avoid the case when a TensorArray is a // parameter. if (var_name == "feed" || var_name == "fetch") continue; - if (var->Type() == typeid(framework::LoDTensorArray)) { + if (var->IsType()) { VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6a5ea64de66fcac44117d0d8f7798e8875703ec6..213c6891d0e2320689c8c69266d40611f295edc8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -27,8 +27,11 @@ namespace details { // training phase. struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - valid_types_.insert(typeid(framework::Tensor)); - valid_types_.insert(typeid(framework::LoDTensor)); + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just @@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner { bool no_tensor_flag_{true}; std::vector arrays_; - std::unordered_set valid_types_; + std::unordered_set valid_types_; std::unordered_set no_tensor_vars_; }; diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 9a393a61c4b4586009a022884e88f3f5c6392ed3..7830e859567747e6c05686335919e8346f76320d 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + template static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, const std::vector> &data) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f05b9832da55f10b34eb2df914e443a478e5a4a4..e7ccea6587a250d9d931fa0e85146e32af714d26 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -49,7 +49,7 @@ struct AnalysisConfig : public NativeConfig { bool use_feed_fetch_ops{true}; void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1); + int max_batch_size = 1, int min_subgraph_size = 3); bool use_tensorrt() const { return use_tensorrt_; } void EnableMKLDNN(); @@ -69,8 +69,19 @@ struct AnalysisConfig : public NativeConfig { bool use_tensorrt_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. int tensorrt_max_batchsize_; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; std::unique_ptr pass_builder_; bool model_from_memory_{false}; }; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 40ca0d287ccde113a20abb1036af289a36f54e6c..1062ac5f58b90d8649dae8bacc9ce154b8b9d844 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,11 +118,13 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 95bbc74a5961eb28a0d8fbd7c680c0740fc68d8a..9aa9db031cd46c9d537bd686f0b23f4c9ae71de6 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -108,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") +# seq_pool1 +inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1 +"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz") + # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142801382b4fdeaa63f51390b63cf6db6cb8f60d..2213971c1764b1a0bddfce5830bbdf2ffedd61ee 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,10 +98,8 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); PaddleTensor input_tensor; input_tensor.name = "word"; - input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); - input_tensor.lod.assign({one_batch.lod}); input_tensor.dtype = PaddleDType::INT64; - TensorAssignData(&input_tensor, {one_batch.data}); + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 8aaab6d6649e1d4b6db7695df0e9dd219c89422c..9d3c7519430522878ace697ea5ed38aebb6b0855 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> query_data_all, title_data_all; + std::vector> query, title; std::vector lod1, lod2; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,22 +31,9 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= query_data_all.size()) { - data.query_data_all.assign(query_data_all.begin() + batch_iter, - query_data_all.begin() + batch_end); - data.title_data_all.assign(title_data_all.begin() + batch_iter, - title_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - CHECK(!data.query_data_all.empty()); - CHECK(!data.title_data_all.empty()); - CHECK_EQ(data.query_data_all.size(), data.title_data_all.size()); - for (size_t j = 0; j < data.query_data_all.size(); j++) { - // calculate lod - data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); - data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size()); - } + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -67,8 +52,8 @@ struct DataRecord { // load title data std::vector title_data; split_to_int64(data[1], ' ', &title_data); - query_data_all.push_back(std::move(query_data)); - title_data_all.push_back(std::move(title_data)); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); } num_samples = num_lines; } @@ -80,15 +65,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_query_tensor.name = "left"; lod_title_tensor.name = "right"; auto one_batch = data->NextBatch(); - int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size - int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size - lod_query_tensor.shape.assign({size1, 1}); - lod_query_tensor.lod.assign({one_batch.lod1}); - lod_title_tensor.shape.assign({size2, 1}); - lod_title_tensor.lod.assign({one_batch.lod2}); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all); + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); // Set inputs. input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f19a2ed59ef2f666393124323ffee2f1e79ccf06..f8635968cebc490cf156162efaef4bbbab9dcc3e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> word_data_all, mention_data_all; + std::vector> word, mention; std::vector lod; // two inputs have the same lod info. - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,20 +31,10 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= word_data_all.size()) { - data.word_data_all.assign(word_data_all.begin() + batch_iter, - word_data_all.begin() + batch_end); - data.mention_data_all.assign(mention_data_all.begin() + batch_iter, - mention_data_all.begin() + batch_end); - // Prepare LoDs - data.lod.push_back(0); - CHECK(!data.word_data_all.empty()); - CHECK(!data.mention_data_all.empty()); - CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); - for (size_t j = 0; j < data.word_data_all.size(); j++) { - // calculate lod - data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); - } + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); } batch_iter += batch_size; return data; @@ -65,8 +53,8 @@ struct DataRecord { // load mention data std::vector mention_data; split_to_int64(data[3], ' ', &mention_data); - word_data_all.push_back(std::move(word_data)); - mention_data_all.push_back(std::move(mention_data)); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); } num_samples = num_lines; } @@ -78,14 +66,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); - int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size - lod_word_tensor.shape.assign({size, 1}); - lod_word_tensor.lod.assign({one_batch.lod}); - lod_mention_tensor.shape.assign({size, 1}); - lod_mention_tensor.lod.assign({one_batch.lod}); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all); + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, + one_batch.lod); // Set inputs. input_slots->assign({lod_word_tensor, lod_mention_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f5082cd60f1ae4e4eaf9dbe59a446ace900ee456..e6d6cd2960b394e8cd20b473bed90ce511f806be 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -18,12 +18,9 @@ namespace paddle { namespace inference { struct DataRecord { - std::vector> title1_all, title2_all, title3_all, l1_all; std::vector> title1, title2, title3, l1; - std::vector title1_lod, title2_lod, title3_lod, l1_lod; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,41 +30,11 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= title1_all.size()) { - data.title1_all.assign(title1_all.begin() + batch_iter, - title1_all.begin() + batch_end); - data.title2_all.assign(title2_all.begin() + batch_iter, - title2_all.begin() + batch_end); - data.title3_all.assign(title3_all.begin() + batch_iter, - title3_all.begin() + batch_end); - data.l1_all.assign(l1_all.begin() + batch_iter, - l1_all.begin() + batch_end); - // Prepare LoDs - data.title1_lod.push_back(0); - data.title2_lod.push_back(0); - data.title3_lod.push_back(0); - data.l1_lod.push_back(0); - CHECK(!data.title1_all.empty()); - CHECK(!data.title2_all.empty()); - CHECK(!data.title3_all.empty()); - CHECK(!data.l1_all.empty()); - CHECK_EQ(data.title1_all.size(), data.title2_all.size()); - CHECK_EQ(data.title1_all.size(), data.title3_all.size()); - CHECK_EQ(data.title1_all.size(), data.l1_all.size()); - for (size_t j = 0; j < data.title1_all.size(); j++) { - data.title1.push_back(data.title1_all[j]); - data.title2.push_back(data.title2_all[j]); - data.title3.push_back(data.title3_all[j]); - data.l1.push_back(data.l1_all[j]); - // calculate lod - data.title1_lod.push_back(data.title1_lod.back() + - data.title1_all[j].size()); - data.title2_lod.push_back(data.title2_lod.back() + - data.title2_all[j].size()); - data.title3_lod.push_back(data.title3_lod.back() + - data.title3_all[j].size()); - data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); - } + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -92,10 +59,10 @@ struct DataRecord { // load l1 data std::vector l1_data; split_to_int64(data[3], ' ', &l1_data); - title1_all.push_back(std::move(title1_data)); - title2_all.push_back(std::move(title2_data)); - title3_all.push_back(std::move(title3_data)); - l1_all.push_back(std::move(l1_data)); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); } num_samples = num_lines; } @@ -109,24 +76,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, title3_tensor.name = "title3"; l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); - int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; - title1_tensor.shape.assign({title1_size, 1}); - title1_tensor.lod.assign({one_batch.title1_lod}); - int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; - title2_tensor.shape.assign({title2_size, 1}); - title2_tensor.lod.assign({one_batch.title2_lod}); - int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; - title3_tensor.shape.assign({title3_size, 1}); - title3_tensor.lod.assign({one_batch.title3_lod}); - int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; - l1_tensor.shape.assign({l1_size, 1}); - l1_tensor.lod.assign({one_batch.l1_lod}); - // assign data - TensorAssignData(&title1_tensor, one_batch.title1); - TensorAssignData(&title2_tensor, one_batch.title2); - TensorAssignData(&title3_tensor, one_batch.title3); - TensorAssignData(&l1_tensor, one_batch.l1); + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, one_batch.title3, one_batch.lod3); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ae840fd11f627d845a20a59ab14118516311d22 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->param_file = FLAGS_infer_model + "/params"; + cfg->prog_file = FLAGS_infer_model + "/model"; + cfg->use_gpu = false; + cfg->device = 0; + cfg->enable_ir_optim = true; + cfg->specify_input_name = true; + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + std::vector feed_names = { + "slot10000_embed", "slot10001_embed", "slot10004_embed", + "slot10005_embed", "slot10008_embed", "slot10009_embed", + "slot10012_embed", "slot10013_embed", "slot10108_embed", + "slot13324_embed", "slot13325_embed", "slot13326_embed", + "slot13327_embed", "slot13328_embed", "slot13329_embed", + "slot13330_embed", "slot13331_embed", "slot15501_embed", + "slot15502_embed", "slot15503_embed", "slot15504_embed", + "slot15505_embed", "slot15506_embed", "slot15507_embed", + "slot15508_embed", "slot15516_embed", "slot15519_embed", + "slot15523_embed", "slot15531_embed", "slot15533_embed", + "slot15548_embed", "slot15564_embed", "slot15565_embed", + "slot15566_embed", "slot15570_embed", "slot15571_embed", + "slot15572_embed", "slot15573_embed", "slot15574_embed", + "slot15575_embed", "slot15576_embed", "slot15577_embed", + "slot15579_embed", "slot15581_embed", "slot15582_embed", + "slot15583_embed", "slot15584_embed", "slot5016_embed", + "slot5021_embed", "slot6002_embed", "slot6003_embed", + "slot6004_embed", "slot6005_embed", "slot6006_embed", + "slot6007_embed", "slot6008_embed", "slot6009_embed", + "slot6011_embed", "slot6014_embed", "slot6015_embed", + "slot6023_embed", "slot6024_embed", "slot6025_embed", + "slot6027_embed", "slot6029_embed", "slot6031_embed", + "slot6034_embed", "slot6035_embed", "slot6036_embed", + "slot6037_embed", "slot6039_embed", "slot6048_embed", + "slot6050_embed", "slot6058_embed", "slot6059_embed", + "slot6060_embed", "slot6066_embed", "slot6067_embed", + "slot6068_embed", "slot6069_embed", "slot6070_embed", + "slot6071_embed", "slot6072_embed", "slot6073_embed", + "slot6182_embed", "slot6183_embed", "slot6184_embed", + "slot6185_embed", "slot6186_embed", "slot6188_embed", + "slot6189_embed", "slot6190_embed", "slot6201_embed", + "slot6202_embed", "slot6203_embed", "slot6247_embed", + "slot6248_embed", "slot6250_embed", "slot6251_embed", + "slot6807_embed", "slot6808_embed", "slot6809_embed", + "slot6810_embed", "slot6811_embed", "slot6812_embed", + "slot6813_embed", "slot6814_embed", "slot6815_embed", + "slot6816_embed", "slot6817_embed", "slot6818_embed", + "slot6819_embed", "slot6820_embed", "slot6822_embed", + "slot6823_embed", "slot6826_embed", "slot7002_embed", + "slot7003_embed", "slot7004_embed", "slot7005_embed", + "slot7006_embed", "slot7008_embed", "slot7009_embed", + "slot7010_embed", "slot7011_embed", "slot7013_embed", + "slot7014_embed", "slot7015_embed", "slot7016_embed", + "slot7017_embed", "slot7019_embed", "slot7100_embed", + "slot7506_embed", "slot7507_embed", "slot7514_embed", + "slot7515_embed", "slot7516_embed"}; + SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params", + &feed_names); +} + +// Easy for profiling independently. +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1, profile) { profile(); } + +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 314); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce05fbfceaec3d8b69367292eca714e4..7eb44d9f4ea6e27a504984eac4f960bddc9032e1 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -132,7 +132,8 @@ std::unordered_map GetFuseStatis(PaddlePredictor *predictor, void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", - std::string params_filename = "params") { + std::string params_filename = "params", + const std::vector *feed_names = nullptr) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -146,29 +147,47 @@ void SetFakeImageInput(std::vector> *inputs, os << "}\n"; } LOG(INFO) << os.str(); - - int dim1 = feed_target_shapes[0][1]; - int dim2 = feed_target_shapes[0][2]; - int dim3 = feed_target_shapes[0][3]; - - PaddleTensor input; - std::vector shape({FLAGS_batch_size, dim1, dim2, dim3}); - input.shape = shape; - input.dtype = PaddleDType::FLOAT32; - - // fill input data, for profile easily, do not use random data here. - size_t size = FLAGS_batch_size * dim1 * dim2 * dim3; - input.data.Resize(size * sizeof(float)); - float *input_data = static_cast(input.data.data()); - for (size_t i = 0; i < size; i++) { - *(input_data + i) = static_cast(i) / size; + if (feed_names) { + PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size()); + } + std::vector input_slots(feed_target_shapes.size()); + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + const auto &feed_shape = feed_target_shapes[i]; + auto &input = input_slots[i]; + std::vector shape({FLAGS_batch_size}); + for (size_t s = 1; s < feed_shape.size(); ++s) { + shape.push_back(static_cast(feed_shape[s])); + } + if (feed_names) { + input.name = (*feed_names)[i]; + } + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + size_t len = std::accumulate(shape.begin(), shape.end(), 1, + [](int a, int b) { return a * b; }); + input.data.Resize(len * sizeof(float)); + input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); + float *input_data = static_cast(input.data.data()); + // fill input data, for profile easily, do not use random data here. + for (size_t j = 0; j < len; ++j) { + *(input_data + j) = static_cast(j) / len; + } } - - std::vector input_slots; - input_slots.assign({input}); (*inputs).emplace_back(input_slots); } +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index ab3a30ce6bba14a7d5ec700a159d90031e6b5dc7..29f0f034a2aab50330d4d0127b870a5cb00d56a5 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") function (inference_download install_dir url filename) message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "finish downloading ${filename}") endfunction() function (inference_download_and_uncompress install_dir url filename) inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} + WORKING_DIRECTORY ${install_dir} + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a14eb941cd98e333a3e85aff064e6099b3be396..ee154207754f20791bdba55d0d66aea600e8dee6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -46,7 +46,7 @@ endif() register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 855c4d70677395992e2bf685c910cbea2d37b20b..49e734ce96b0d38b59102575250a020e6924362a 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", - in_var->Type().name()); + framework::ToTypeName(in_var->Type())); } PADDLE_ENFORCE_NOT_NULL(input); diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 48800947fd387bf4d84a85e82fdcd7efa3f08de5..0360cf5273591946570cac47e2578e43f498b550 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (framework::IsType(og_outside.Type())) { + if (og_outside.IsType()) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (framework::IsType( - og_outside.Type())) { + } else if (og_outside.IsType()) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase { var->IsType(), "Currently the type of var only can be LoDTensorArray, " "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, var->Type().name()); + inside_grad_name, framework::ToTypeName(var->Type())); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b999e1ca3f992034781edaab9921a300..e73762f5fb2386633212c5aa9fc768153cf63f85 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -22,7 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -161,9 +161,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if ((activation == "identity") && - (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && - (!residual)) { + if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. // But test in some case, the speed is slower, change to use @@ -204,7 +202,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index f2ba75485c58789de848b8833a1a527d45ced83c..fae0925149146dc06c50e0a9ff61b1686a9b7b5c 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -22,239 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -struct CudnnRNNCache { - CudnnRNNCache() { - x_desc_ = NULL; - y_desc_ = NULL; - dx_desc_ = NULL; - dy_desc_ = NULL; - } - ~CudnnRNNCache() { release(); } - - cudnnRNNDescriptor_t rnn_desc_; - cudnnTensorDescriptor_t *x_desc_; - cudnnTensorDescriptor_t *y_desc_; - cudnnTensorDescriptor_t *dx_desc_; - cudnnTensorDescriptor_t *dy_desc_; - - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; - - cudnnTensorDescriptor_t dhx_desc_; - cudnnTensorDescriptor_t dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_; - cudnnTensorDescriptor_t dcy_desc_; - - cudnnTensorDescriptor_t output_x_desc_; - cudnnTensorDescriptor_t output_y_desc_; - - cudnnDropoutDescriptor_t dropout_desc_; - - size_t weights_size_; - cudnnFilterDescriptor_t w_desc_; - cudnnFilterDescriptor_t dw_desc_; - - size_t workspace_size_; - size_t reserve_size_; - Tensor reserve_data_; - Tensor workspace_data_; - - Tensor dropout_state_; - - size_t max_length_; - - float dropout_prob_; - bool is_bidirec_; - - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - int seed_; - - void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, - size_t max_len, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, bool is_bidirec, int seed, - int weight_numel) { - max_length_ = max_len; - batch_size_ = batch_size; - input_size_ = input_size; - hidden_size_ = hidden_size; - num_layers_ = num_layers; - dropout_prob_ = dropout_prob; - is_bidirec_ = is_bidirec; - seed_ = seed; - - x_desc_ = new cudnnTensorDescriptor_t[max_length_]; - y_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; - int dim_a[3]; - int stride_a[3]; - - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); - dim_a[0] = batch_size_; - dim_a[1] = input_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - dim_a[0] = batch_size_; - dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - } - - dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); - dim_a[1] = batch_size_; - dim_a[2] = hidden_size_; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - CUDNN_ENFORCE( - platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); - - size_t state_size; - CUDNN_ENFORCE( - platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); - dropout_state_.Resize({static_cast(state_size)})); - auto *dropout_state_data = - dropout_state_.mutable_data(ctx.GetPlace()); - CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( - dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, - seed_)); - - CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - -#if CUDNN_VERSION >= 6000 - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); -#else - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_DATA_FLOAT)); -#endif - - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); - - PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, - "cudnn lstm weight size should be SAME"); - int dim_w[3]; - dim_w[0] = weights_size_ / sizeof(float); - dim_w[1] = 1; - dim_w[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); - - reserve_data_.Resize({static_cast(reserve_size_)}); - reserve_data_.mutable_data(ctx.GetPlace()); - - workspace_data_.Resize({static_cast(workspace_size_)}); - workspace_data_.mutable_data(ctx.GetPlace()); - } - - void release() { - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); - } - - delete[] x_desc_; - delete[] y_desc_; - delete[] dx_desc_; - delete[] dy_desc_; - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); - } -}; - template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { auto input_w_numel = w->numel(); auto batch_size = x->dims()[1]; - cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, - hidden_size, num_layers, dropout_prob, is_bidirec, - seed, input_w_numel); + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); } auto run_seq_len = x->dims()[0]; diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..7f18b839271a29523cc06c999c28cc0394717397 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90784ed089cd620a4f0a908f196bcdda..7c0fda4169b5e1cf663d04b78b6425d73965c292 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index a800d5df0a7cbc668a0217350098bce2bfdcfa70..8660bc219c12fe8676e3be646c4b878a39700201 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... args) { - PADDLE_ENFORCE(ptr != nullptr, args...); + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index d2b0eb6ca6de1984dc7cfc2a662c88d5e56e1e05..27ca1f4edc04f5fca54b1a6340243634a596939c 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -84,7 +84,9 @@ class ProtoEncodeHelper { ~ProtoEncodeHelper() { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - paddle::platform::throw_on_error(p_ <= limit_); + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_); + } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3c0b7ff24f9cf0b128d32aa185e5e3cb47d9c4b9..a8bb597cbd59290df1347c164d37104c6ac431e9 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) if(WITH_GPU AND NOT WIN32) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) - op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common) + op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index acc9b1e6227942781db61a3bc50b2ac95865f79c..6676ecd1c85d70cd5961af2fb1537e77b10e41bc 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel { } else { PADDLE_THROW( "% should be LoDTensor or SelectedRows, but the received type is %s", - ctx.Inputs("Ids")[0], ids_var->Type().name()); + ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type())); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index a8b8a67a114b956f2d6b1b072ef343a179114b34..7a7a3989c047ae379cc14e2f783662db99239445 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z = ctx.Output("Out"); } else { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - x_var->Type().name()); + framework::ToTypeName(x_var->Type())); } z->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 0a18882e8199c2a375a230a693b8b01d12aabfa0..4e4f977fcc742856b877ef0b7f9a3cc9879aefce 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,8 +50,8 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); @@ -132,8 +132,8 @@ template class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); PADDLE_ENFORCE( diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index 6c616aa03d9809e9b7725a700c7edd5ff5d6dc42..3f51bb0b3d6ddf41a08a64f254f76c88b60ced22 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 9f6ef391696aa8718be71ae945e746b876813d94..13455fc42cdc72a8ebfcac3dc0c94b79497d91f6 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto *param_out_tensor = ctx.Output("ParamOut"); auto *moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1138bb7400e0e7a00983e7bfaad2b2d9704b77ab..5c559484ec95e794ebbbe0e713cb9e26b5c01b98 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -347,7 +347,8 @@ class AdamOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h index 7137fbd9651b4523f6d1609a0595b30758aa40df..55d25ecbddf175c0c9ba2c68ef2f6c7b83dcf32e 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ b/paddle/fluid/operators/optimizers/adamax_op.h @@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 5df43d33ef9f720fd20d57c53ff37cc85440b24e..4abd436927707f1a18039c9104a92b2a0bf3c982 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 8f812c9a037bfac8c1e29e32a5ad5b077c8153d1..bbf34d8316b09a78c334b0d79b132639be8af4f7 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index f6ef83c3bad23d709b386f8e75bbc97fa9ba0aab..3ed1bff5ff4993e9c858dea8d56a8cb6124aca89 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -395,7 +395,7 @@ class MomentumOpKernel : public framework::OpKernel { PADDLE_THROW( string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " "gradient, but the received Variable Type is %s", - grad_var->Type().name())); + framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a9d303d55d8f681fe3a014db36ede5ef6b2742bd..975e4b8e7212bc61d5df0ca350fcf12afb463cba 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 767449cde981e5925b7144ff1038560c67651f3e..5ede972c71ff3ef8ff00756b97662aabb54d6349 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - std::vector> copy_ranges(mask_dim[0]); + std::vector> copy_ranges(2); // set out_true/out_false lod for (size_t t = 0; t < 2; t++) { diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce..c39f94637a1abb5bfce9a5428419282f2b870c91 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 4f717a43551d6d79292bd1d49664d35588a8ec3a..01996e6bf975227270914aa6bec26aacfc814c94 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -126,7 +126,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", - x_vars[0]->Type().name()); + framework::ToTypeName(x_vars[0]->Type())); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 76cc796a9b8e21849b1d86e512cd70752fd027ac..a8b2df186dbfcb2a913e9532e2a475f1ad0d23a1 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07f848356e801be95d53a61bb5b08e37..06680539507886becb751d02cb02abbf702f1948 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,21 +258,49 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ + ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); + +#define __PADDLE_THROW_ON_ERROR(COND, ...) \ + __PADDLE_THROW_ERROR_I( \ + __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define __PADDLE_ENFORCE_I(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (false) + } while (0) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG +#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args +#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) + #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d52182965552e9ec945cb7d0b421d8addcb758e9..1091badae54a809c4a9da6d0398bcbb538420af0 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -37,6 +37,25 @@ TEST(ENFORCE, FAILED) { HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok at all"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok at all")); + } + EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_NE(std::string(error.what()).find(" at "), 0); + } + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 27e930e6e0a76982b3f27619f38a4a08d82cafa1..3a937dfaec3acc7c116f0077694e9aee1b379061 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index e2b7ca9b03809113c31af8ff4d3ad3713748f330..b1b51d804e02f233bcd16149005092dc80e9c79d 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb8bcb190bda59e23d118547f451be46c963cce9..72b0f216d3aafbb95931590935b0bf967a8d5be8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f8ded9f94ecaf3df1e14aead60ae12abcf8c34a9..06d8b65fb1480d9f621ca937c1d66ab7e910f010 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,9 +49,6 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); - op_proto_and_checker_maker.def( - "kOpCreationCallstackAttrName", - framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 45de9c980b874e3ccc8ec177310ebc4276bf6407..3bb08cbeb70b5a713c6bddf335d44091eed1b153 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -32,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" @@ -117,6 +118,9 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); + m.add_object("_cleanup", + py::capsule([]() { ScopePool::Instance().Clear(); })); + py::class_(m, "VarBase", R"DOC()DOC") .def(py::init<>()) .def("_run_backward", @@ -454,7 +458,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); - py::class_(m, "Scope", R"DOC( + py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. Variables in a parent scope can be retrieved from local scope. @@ -474,17 +478,26 @@ All parameter, weight, gradient are variables in Paddle. param.set(param_array, place) )DOC") + .def("_remove_from_pool", + [](Scope &self) { ScopePool::Instance().Remove(&self); }) .def("var", [](Scope &self, const std::string &name) -> Variable * { return self.Var(name); }, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference) - .def(py::init<>()) .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids); + m.def("Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + py::return_value_policy::reference); + //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. m.def("get_all_op_protos", []() -> std::vector { diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index a2eec6e3c48dd126614bbff0227145537b678ac4..0b94b60018aac3a61edfda4d7ecb762e9fe70673 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -87,7 +87,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { template std::string Sprintf(const Args&... args) { std::ostringstream oss; - Fprintf(oss, ""); + Fprintf(oss, "%s", args...); return oss.str(); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 418dc1346840f707f38836e09a14ca2c19c063ff..1220f801007854c444ab9852d6fc24f67fe95e4b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - #================================================= # Utils #================================================= @@ -418,13 +417,6 @@ EOF else ctest --output-on-failure fi - - # make install should also be test when unittest - make install -j `nproc` - pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi fi } @@ -922,6 +914,7 @@ function main() { ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals ;; test_inference) gen_capi_package @@ -946,6 +939,15 @@ function main() { run_test assert_api_not_changed ${PYTHON_ABI:-""} ;; + cmake_gen) + cmake_gen ${PYTHON_ABI:-""} + ;; + gen_fluid_lib) + gen_fluid_lib + ;; + test_fluid_lib) + test_fluid_lib + ;; *) print_usage exit 0 diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8f3660ca387ba10309f829bd04ac2ffdc573f3d6..e0078e53141ac7834fd00e4f2dbd8a6c8a1d6b1b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -46,7 +46,7 @@ from . import transpiler from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f2886090d75f87654b33cf7aa6f98ebf6f2e27d1..5a9e908b61eeeea3fdfdfcc54d1f150f59a3973b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True): assert isinstance(name, str) if scope is None: scope = global_scope() - assert isinstance(scope, core.Scope) + assert isinstance(scope, core._Scope) var = scope.find_var(name) assert var is not None, ( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3427fb0c4ae4dad0323b39cf85bfb5b04f796996..de30ed2fc5858187d2ecede299832701304e4198 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -20,7 +20,6 @@ import os import re import six import sys -import traceback import numpy as np @@ -605,10 +604,6 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] - callstack_var_name = op_maker.kOpCreationCallstackAttrName() - op_attrs[callstack_var_name] = list( - reversed(traceback.format_stack()))[1:] - if len(self.desc.type()) != 0: return if type is None: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 74cf76da951a4cea884c4fdb8591b3d4fb010300..c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,7 @@ class ParallelExecutor(object): trainers_endpoints), "num_trainers == len(end_points)" build_strategy.trainers_endpoints = trainers_endpoints - # step5: get persistable_vars, parameter_vars, places. persistable_vars + # step6: get persistable_vars, places. persistable_vars # need be broadcast to other local_scope. persistable_vars = set([ cpt.to_text(v.name) for v in [ @@ -164,7 +164,7 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) - # step6: init ParallelExecutor + # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, cpt.to_text(loss_name) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index d744a00242422defb360590b193e07c6f811dcb9..e87c1d58c812f3186658a78c4280f16d34c466ee 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -185,8 +185,10 @@ def main(use_cuda, parallel): if __name__ == '__main__': - for use_cuda in (False, True): - for parallel in (False, True): - if use_cuda and not core.is_compiled_with_cuda(): - continue - main(use_cuda=use_cuda, parallel=parallel) + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + if not on_ci: + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2a4e5ca0c050813785f602c5d2088466e616971..28602d3251a36130bfcfdda406aa85673e1ad4c7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -15,6 +15,18 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os + + +def skip_ci(func): + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + + def __func__(*args, **kwargs): + if on_ci: + return + return func(*args, **kwargs) + + return __func__ class TestDistSeResneXt2x2(TestDistBase): @@ -22,6 +34,7 @@ class TestDistSeResneXt2x2(TestDistBase): self._sync_mode = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -32,6 +45,7 @@ class TestDistseResnXt2x2WithMemopt(TestDistBase): self._mem_opt = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -41,6 +55,7 @@ class TestDistSeResneXt2x2Async(TestDistBase): self._sync_mode = False self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 37b9a9188ab44df81029ae6d9925ae21c1929cff..4153394c1da776d0a41e1415a09fa7d6f4b14d6d 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope", "op_callstack" + "op_namescope" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 943ad3ed22480193dc51375cdcca5ed36ce35158..655378f7f8c18f5936643f6c178d0e6b3dd8cac8 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -26,7 +26,7 @@ os.environ['CPU_NUM'] = str(dev_cnt) def dummy_func_with_no_input(): - return float(1.0) + return np.array([0], dtype='float32') def dummy_func_with_no_output(x): @@ -105,7 +105,7 @@ def simple_fc_net(img, label, use_py_func_op): name='test_tmp_var', dtype='float32', shape=[1]) fluid.layers.py_func( func=dummy_func_with_no_input, x=None, out=dummy_var) - + loss += dummy_var fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) loss = fluid.layers.mean(loss) @@ -174,7 +174,7 @@ class TestPyFuncOpUseExecutor(unittest.TestCase): self.assertAlmostEqual(max_diff, 0, delta=1e-3) -class TestPyFuncOpUseParallelExecutor(unittest.TestCase): +class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): def setUp(self): self.use_parallel_executor = True diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py new file mode 100644 index 0000000000000000000000000000000000000000..f37d2bfb2e86b452cf7fd05c3e5871de2e33d629 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import contextlib + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.fluid.core as core + +import paddle.fluid as fluid + + +def get_places(): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + +@contextlib.contextmanager +def prog_scope_guard(main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestWeightDecay(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=4)() + self.train_data = [next(reader) for _ in range(5)] + self.learning_rate = .5 + + def run_executor(self, place, feed_list, loss): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + main_prog = fluid.default_main_program() + loss_set = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=[loss.name]) + + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def run_parallel_exe(self, + place, + feed_list, + loss, + use_cuda=True, + use_reduce=False, + use_fast_executor=False, + use_ir_memory_optimize=False): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + if use_fast_executor: + exec_strategy.use_experimental_executor = True + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.memory_optimize = use_ir_memory_optimize + + parallel_exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + loss_set = [] + for data in self.train_data: + out = parallel_exe.run(feed=feeder.feed(data), + fetch_list=[loss.name]) + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def check_weight_decay(self, + place, + model, + use_parallel_exe=False, + use_reduce=False): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + param_list = [(var, var * self.learning_rate) + for var in main_prog.block(0).all_parameters()] + + optimizer = fluid.optimizer.Adagrad( + learning_rate=self.learning_rate) + + optimizer.minimize(avg_cost) + + for params in param_list: + updated_p = fluid.layers.elementwise_sub( + x=params[0], y=params[1]) + fluid.layers.assign(input=updated_p, output=params[0]) + + if use_parallel_exe: + loss = self.run_parallel_exe( + place, [data, label], + loss=avg_cost, + use_cuda=True, + use_reduce=use_reduce) + else: + loss = self.run_executor(place, [data, label], loss=avg_cost) + + return loss + + def test_weight_decay(self): + model = partial(bow_net, is_sparse=False) + for place in get_places(): + loss = self.check_weight_decay(place, model, use_parallel_exe=False) + + loss2 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=False) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5) + + loss3 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=True) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index ccf7af334d091dff5cf6a94e5875cdf582fa0c5c..cc7f5ec90c26c87b7c5514c900e853be9e16d6eb 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,7 +57,7 @@ class InferenceTranspiler(object): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))