diff --git a/CMakeLists.txt b/CMakeLists.txt index 68447727118a91a2a8c0d06404353c7ccb734c6d..48e52961a95d50264b201eec50ccb3a462f39c54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,6 +204,11 @@ include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) +include(flags) # set paddle compile flags +include(cudnn) # set cudnn libraries, must before configure +include(cupti) +include(configure) # add paddle env configuration + if(WITH_GPU) include(cuda) include(tensorrt) @@ -212,15 +217,11 @@ elseif() set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE) endif() -include(cudnn) # set cudnn libraries, must before configure -include(cupti) -include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(rdma) # set rdma libraries -include(flags) # set paddle compile flags include(version) # set PADDLE_VERSION include(coveralls) # set code coverage include(inference_lib) # add paddle fluid inference libraries diff --git a/cmake/configure.cmake b/cmake/configure.cmake index d14162e0a662afe63152bfc2132e5dfd54f5a86c..e03e15bfc017ce33e06192a7fa8010ffe060adcb 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -50,16 +50,16 @@ if(NOT WITH_PROFILER) endif(NOT WITH_PROFILER) if(NOT CMAKE_CROSSCOMPILING) - if(WITH_AVX AND AVX_FOUND) + if(WITH_AVX AND AVX512F_FOUND) + set(SIMD_FLAG ${AVX512F_FLAG}) + elseif(WITH_AVX AND AVX2_FOUND) + set(SIMD_FLAG ${AVX2_FLAG}) + elseif(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() endif() -if(UNIX AND NOT APPLE) - # except apple from nix*Os family - set(LINUX TRUE) -endif(UNIX AND NOT APPLE) if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) @@ -103,15 +103,20 @@ if(WITH_GPU) endif() if(WITH_ANAKIN) if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) - message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile") + message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF") + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE) endif() if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile") + message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." 
FORCE) endif() - set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR}) - set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY}) - message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h") - message(STATUS "cudnn library is ${CUDNN_LIBRARY}") + endif() + if(WITH_ANAKIN) + # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR + # is a softlink to real cudnn.h directory + set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/") + get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY) + set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY_DIR}) endif() elseif(WITH_AMD_GPU) add_definitions(-DPADDLE_WITH_HIP) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2..cd51533926de7bb132ab7bfab1686d664a331410 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -25,8 +25,25 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib - /usr/lib) -find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR} + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ) +set(CUDNN_LIB_NAME "") +if (LINUX) +set(CUDNN_LIB_NAME "libcudnn.so") +endif(LINUX) + +if(WIN32) +# only support cudnn7 +set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") +endif(WIN32) + +if(APPLE) +set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") +endif(APPLE) + +find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} NO_DEFAULT_PATH DOC "Path to cuDNN library.") diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 855897394a602736f3a4fa1132c634f8cb537468..5d11d238cd73653b60623bf7e32b94f5f1ac9ecc 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -19,17 +19,17 @@ execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-cer include_directories(${ANAKIN_INCLUDE}) include_directories(${ANAKIN_INCLUDE}/saber/) -set(ANAKIN_COMPILE_EXTRA_FLAGS +set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable - -Wno-error=unused-variable -Wno-unused-variable + -Wno-error=unused-variable -Wno-unused-variable -Wno-error=format-extra-args -Wno-format-extra-args - -Wno-error=comment -Wno-comment - -Wno-error=format -Wno-format + -Wno-error=comment -Wno-comment + -Wno-error=format -Wno-format -Wno-error=switch -Wno-switch - -Wno-error=return-type -Wno-return-type + -Wno-error=return-type -Wno-return-type -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor -Wno-sign-compare - -Wno-reorder + -Wno-reorder -Wno-error=cpp) ExternalProject_Add( @@ -47,6 +47,7 @@ ExternalProject_Add( -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DCUDNN_ROOT=${CUDNN_ROOT} + -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} ) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 1120677a37e0d44163816b66600121c8f0d545af..8ac157c4d79f1f5f2a655152f46b4a4d3f2c6962 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -142,6 +142,11 @@ else() ${GPU_COMMON_FLAGS}) endif() +if(UNIX AND NOT APPLE) + # except apple from nix*Os family + set(LINUX TRUE) +endif(UNIX AND NOT APPLE) + foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 53c2de332ea74b06d1bd6e5bb119cad6af27ed01..3eacf4d86aa0385eddb690d72e85e3384929bb99 100644 --- a/cmake/simd.cmake +++
b/cmake/simd.cmake @@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID set(SSE3_FLAG "-msse3") set(AVX_FLAG "-mavx") set(AVX2_FLAG "-mavx2") + set(AVX512F_FLAG "-mavx512f") elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") @@ -81,5 +82,16 @@ int main() return 0; }" AVX2_FOUND) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include <immintrin.h> +int main() +{ + __m512i a = _mm512_undefined_epi32(); + return 0; +}" AVX512F_FOUND) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) -mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) +mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index fac9f16a89bab311c338475aef7c79015ab466be..2ec422cc17faf7f6b99ac70b5f175881bf017566 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -99,8 +99,13 @@ else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) endif() - -cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass) +if (NOT WIN32) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + graph graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fast_threaded_ssa_graph_executor) +endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8f6c4163d6ee11fbe83f603f6148c2ac6175324d..abd5459f6d47da6d1341284916b419325dc5977c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -42,3 +42,5 @@ cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_b cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) +cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc + DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 716d674fa29bad9321fc20979775c06f26bf4679..5183be878eb49cccc68603c3fdd8023be5578036 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -19,10 +19,13 @@ namespace framework { namespace details { struct ExecutionStrategy { + enum ExecutorType { kDefault = 0, kExperimental = 1 }; + size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; size_t num_iteration_per_drop_scope_{100}; + ExecutorType type_{kDefault}; }; } // namespace details diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..7606f2bc06b2ecf07c5649eeae1a2d5587a8880c --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include <string> +#include <vector> +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes, + const std::vector<platform::Place> &places, + std::unique_ptr<ir::Graph> &&graph) + : strategy_(strategy), + local_scopes_(local_scopes), + places_(places), + graph_(std::move(graph)), + pool_(strategy.num_threads_ + + 1), // add one more thread for generating op_deps + fetch_ctxs_(places) { + auto &ops = graph_->Get<details::GraphOps>("ops"); + + for (auto &op : ops) { + int dep = static_cast<int>(op->NotReadyInputSize()); + op_deps_.emplace(op.get(), dep); + if (dep == 0) { + bootstrap_ops_.emplace_back(op.get()); + } + } + + PrepareAtomicOpDeps(); +} + +FeedFetchList FastThreadedSSAGraphExecutor::Run( + const std::vector<std::string> &fetch_tensors) { + std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>> + op_deps = atomic_op_deps_.get(); + PrepareAtomicOpDeps(); + + paddle::framework::FeedFetchList fetches; + fetches.resize(fetch_tensors.size()); + std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars; + std::vector<std::unique_ptr<ir::Node>> fetch_nodes; + std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops; + + for (auto &fetch_var_name : fetch_tensors) { + for (auto &var_map : graph_->Get<details::GraphVars>("vars")) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { + fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); + } + } + } + + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors[i]; + auto fetched_var_it = fetched_vars.find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), + "Cannot find fetched variable.(Perhaps the main_program " + "is not set to ParallelExecutor)"); + + auto &vars = fetched_var_it->second; + + fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); + auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i, + &local_scopes_); + fetch_ops.emplace_back(op); + + for (auto &p : places_) { + op->SetDeviceContext(p, fetch_ctxs_.Get(p)); + } + + for (auto *var : vars) { + op->AddInput(var); + } + + (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize()); + } + + size_t num_complete = 0; + remaining_ = 0; + BlockingQueue<size_t> complete_q; + for (auto op : bootstrap_ops_) { + RunOpAsync(op_deps.get(), op, &complete_q); + } + + while (num_complete != op_deps->size()) { + size_t num_comp = complete_q.Pop(); + if (num_comp == -1UL) { + int remaining = 0; + while (true) { + remaining = remaining_; + if (remaining == 0) { + break; + }
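+ // An op hit an exception: drain the completion signals of the ops still in flight before rethrowing. +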
for (int i = 0; i < remaining; ++i) { + complete_q.Pop(); + } + } + exception_.ReThrow(); + } + num_complete += num_comp; + } + // Wait FetchOps. + if (!fetch_ops.empty()) { + fetch_ops.clear(); + } + return fetches; +} +void FastThreadedSSAGraphExecutor::RunOpAsync( + std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, + OpHandleBase *op, BlockingQueue<size_t> *complete_q) { + ++remaining_; + this->pool_.enqueue([=] { + OpHandleBase *op_to_run = op; + size_t complete = 0; + while (op_to_run != nullptr) { + try { + op_to_run->Run(strategy_.use_cuda_); + ++complete; + } catch (...) { + exception_.Catch(std::current_exception()); + --remaining_; + complete_q->Push(-1UL); + return; + } + auto &outputs = op_to_run->Outputs(); + op_to_run = nullptr; + for (auto &output : outputs) { + for (auto &pending_op : output->PendingOps()) { + std::atomic<int> &deps = op_deps->at(pending_op); + if (deps.fetch_sub(1) == 1) { // pending_op ready + if (op_to_run == nullptr) { + op_to_run = pending_op; + } else { + this->RunOpAsync(op_deps, pending_op, complete_q); + } + } + } + } + } + --remaining_; + complete_q->Push(complete); + }); +} +void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { + atomic_op_deps_ = pool_.enqueue([&] { + std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps = + new std::unordered_map<OpHandleBase *, std::atomic<int>>; + for (auto &pair : op_deps_) { + (*op_deps)[pair.first] = pair.second; + } + return std::unique_ptr< + std::unordered_map<OpHandleBase *, std::atomic<int>>>(op_deps); + }); +} + +const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..dad3a231cba6402f57ba654a9ac5fb520b9c8f04 --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once +#include <string> +#include <vector> +#include "ThreadPool.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" + +namespace paddle { +namespace framework { +class Scope; +namespace details { + +class OpHandleBase; +class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { + public: + FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector<Scope *> &local_scopes, + const std::vector<platform::Place> &places, + std::unique_ptr<ir::Graph> &&graph); + FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override; + const ir::Graph &Graph() const override; + + private: + ExecutionStrategy strategy_; + std::vector<Scope *> local_scopes_; + std::vector<platform::Place> places_; + std::unique_ptr<ir::Graph> graph_; + + std::unordered_map<OpHandleBase *, int> op_deps_; + std::vector<OpHandleBase *> bootstrap_ops_; + + ::ThreadPool pool_; + platform::DeviceContextPool fetch_ctxs_; + std::atomic<int> remaining_; + + void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, + OpHandleBase *op, BlockingQueue<size_t> *complete_q); + + void PrepareAtomicOpDeps(); + + std::future< + std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>> + atomic_op_deps_; + ExceptionHolder exception_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index ee9f9184da65467b82794c99fe3e95b108373753..3812f0abf1b7069525c4420054c61c01c908acfe 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -158,6 +158,16 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, #endif } +size_t OpHandleBase::NotReadyInputSize() const { + std::unordered_set<VarHandleBase *> res; + for (auto *var : inputs_) { + if (var->GeneratedOp() != nullptr) { + res.emplace(var); + } + } + return res.size(); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 2d7f18942890245249dd0619a40bb43833c9a2ee..9fbefabc841e3f6940860f60d959fee97495e4c9 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -81,6 +81,8 @@ class OpHandleBase { return res.size(); } + size_t NotReadyInputSize() const; + const std::vector<VarHandleBase *> &Outputs() const { return outputs_; } size_t NoDummyInputSize() const; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index f87d5212c0cd87a5a63cf2d54ca677516ab45816..2a6bf4ac230df81b38751000bf4b663f24984db3 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -117,7 +117,15 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } // For output args, always create a new var. for (auto &each_var_name : op->OutputArgumentNames()) { - ir::Node *var = CreateVarNode(all_vars.at(each_var_name)); + ir::Node *var = nullptr; + if (all_vars.count(each_var_name) != 0) { + var = CreateVarNode(all_vars.at(each_var_name)); + } else { + // Operation output vars can be @EMPTY@. For example, while_grad + // can have multi @EMPTY@ outputs with no VarDesc. + // TODO(panyx0718): Add a test.
+ var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); + } var_nodes[each_var_name].push_back(var); node->outputs.push_back(var); var->inputs.push_back(node); @@ -208,7 +216,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { // Add write after write dependence ir::Node *upstream_op = (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0]; - if (upstream_op) { + // TODO(zcd): Add a test. + if (upstream_op && upstream_op != write_op) { ir::Node *dep_var = CreateControlDepVar(); write_op->inputs.push_back(dep_var); upstream_op->outputs.push_back(dep_var); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 275cb8c592c3c0b153d31149570cd6596b9e1a7f..81cb24bdda6b87a3d708cf5047dce05d5020a0d5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" @@ -193,8 +194,14 @@ ParallelExecutor::ParallelExecutor( member_->local_scopes_, member_->use_cuda_, build_strategy); #endif - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, std::move(graph))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, std::move(graph))); + } + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 20bdc7830f32564448a69e9cd76c02585b7a1aca..344c001a69b53c82967ee983783892a514c2490b 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -55,11 +55,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { auto all_ops = blocks_[block_id]->AllOps(); for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) { auto &op = all_ops[op_id]; + for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { int sub_block_id = o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { + std::vector sub_block_ids = + o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name); + std::vector block_descs; + for (int block_id : sub_block_ids) { + block_descs.push_back(MutableBlock(block_id)); + } + op->SetBlocksAttr(attr_name, block_descs); } } } @@ -68,24 +77,16 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { desc_ = desc; - for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDesc(this, &block_desc)); - } - for (auto &block : blocks_) { - for (auto *op : block->AllOps()) { - for (const auto &attr : op->Proto()->attrs()) { - if (attr.type() == 
proto::AttrType::BLOCK) { - size_t blk_idx = attr.block_idx(); - op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); - } - } - } - } + InitFromProto(); } ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); + InitFromProto(); +} + +void ProgramDesc::InitFromProto() { for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } @@ -95,6 +96,13 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { if (attr.type() == proto::AttrType::BLOCK) { size_t blk_idx = attr.block_idx(); op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); + } else if (attr.type() == proto::AttrType::BLOCKS) { + auto blks_idx = attr.blocks_idx(); + std::vector block_descs; + for (int blk_idx : blks_idx) { + block_descs.push_back(this->MutableBlock(blk_idx)); + } + op->SetBlocksAttr(attr.name(), block_descs); } } } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 65fa0a0cfd5ba6d9b8765cee1309e118cb74348a..f3afc85eb924e4b03b7597e043ffd4e267adc977 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -76,6 +76,8 @@ class ProgramDesc { void SetFetchHolderName(const std::string &fetch_holder_name); private: + void InitFromProto(); + proto::ProgramDesc desc_; std::vector> blocks_; diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 6c46e9aad5b7fbf67fdcc07a12e7932ac8b6412b..925ea98dbe62e4da91689f6e56c135e51c24a8a3 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -42,6 +42,19 @@ TEST(ProgramDesc, copy_ctor) { out->SetType(proto::VarType::LOD_TENSOR); op->SetOutput("Y", {out->Name()}); + BlockDesc* new_block = program.AppendBlock(*global_block); + op = new_block->AppendOp(); + op->SetType("mul"); + + op = global_block->AppendOp(); + op->SetType("op_with_subblock"); + op->SetAttr("sub_block", new_block); + + std::vector sub_blocks; + sub_blocks.push_back(program.AppendBlock(*global_block)); + sub_blocks.push_back(program.AppendBlock(*global_block)); + op->SetAttr("sub_blocks", sub_blocks); + ProgramDesc program_copy(program); auto* global_block_copy = program_copy.MutableBlock(0); @@ -64,6 +77,8 @@ TEST(ProgramDesc, copy_ctor) { assert_same_var("Y", y); assert_same_var("Out", out); + bool found_sub_block = false; + bool found_sub_blocks = false; for (size_t i = 0; i < global_block->OpSize(); ++i) { auto op_origin = global_block->Op(i); auto op_copy = global_block_copy->Op(i); @@ -74,8 +89,17 @@ TEST(ProgramDesc, copy_ctor) { ASSERT_EQ(op_copy->Proto()->SerializeAsString(), op_origin->Proto()->SerializeAsString()); - } + if (op->Type() == "op_with_subblock") { + ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); + found_sub_block = true; + + ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size()); + found_sub_blocks = true; + } + } + ASSERT_TRUE(found_sub_block); + ASSERT_TRUE(found_sub_blocks); // Not check block's protostr are same it because the order of vars could be // different and it is correct. 
} diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index a72e27d651d0591815a9d93354d2aea8aa216de6..ce6c8f0474d7bb5cd67be2e6ef919835cf1c2a11 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -62,13 +62,13 @@ endif() if (WITH_ANAKIN AND WITH_GPU) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) - #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() anakin_target(inference_anakin_api) - #anakin_target(inference_anakin_api_shared) + anakin_target(inference_anakin_api_shared) if (WITH_TESTING) cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin diff --git a/paddle/fluid/inference/api/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md index 2fb914592cbcb1b0c3f2ef33ff9cf4c295e427b6..442c598978c700f4c438b365b8900db5b65bc5ec 100644 --- a/paddle/fluid/inference/api/high_level_api_cn.md +++ b/paddle/fluid/inference/api/high_level_api_cn.md @@ -65,13 +65,13 @@ config.model_dir = "xxx"; config.use_gpu = false; // 创建一个原生的 PaddlePredictor auto predictor = - paddle::CreatePaddlePredictor(config); + paddle::CreatePaddlePredictor(config); // 创建输入 tensor int64_t data[4] = {1, 2, 3, 4}; paddle::PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + .data = paddle::PaddleBuf(data, sizeof(data)), + .dtype = paddle::PaddleDType::INT64}; // 创建输出 tensor,输出 tensor 的内存可以复用 std::vector outputs; // 执行预测 diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index b52d083f280e5e7713600a7b748dedd37aca0a1e..a610687a5b11999a7cb7426dbe961e5972ee1746 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto) +nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(convert) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e29fe2a42bd1aaee1ea8c01159e331cf47ca6b72..ed8e9ed77fb233e40bb78329a246ff724b21c547 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -84,6 +84,15 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() + #remove windows unsupported op + if (WIN32) + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") + if ("${TARGET}" STREQUAL "${windows_unsupport_op}") + return() + endif() + endforeach() + endif(WIN32) + list(LENGTH op_library_DEPS op_library_DEPS_len) if (${op_library_DEPS_len} GREATER 0) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) @@ -181,19 +190,19 @@ 
function(op_library TARGET) endfunction() add_subdirectory(math) +if (NOT WIN32) add_subdirectory(nccl) - if(WITH_GPU) op_library(nccl_op DEPS nccl_common) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() +endif() # NOT WIN32 set(DISTRIBUTE_DEPS "") if(WITH_DISTRIBUTE) add_subdirectory(distributed) - set(DISTRIBUTE_DEPS "") if(WITH_GRPC) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) @@ -222,7 +231,7 @@ if(WITH_DISTRIBUTE) #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) - if(WITH_GPU) + if(WITH_GPU AND NOT WIN32) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL) if(WITH_GRPC) @@ -233,7 +242,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) - endif() + endif() # WITH_GPU AND NOT WIN32 else() set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() @@ -331,5 +340,7 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index 580fde753816c30b188b8a99cc63fcbafde64e25..135254ce6b6bf9add7bb1f0c3f645ed47081fba4 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -29,9 +29,9 @@ class ConditionalOp : public framework::OperatorBase { protected: std::vector InputTensors( - const framework::Scope &scope) const { + const framework::Scope &scope, const std::string &in_name) const { std::vector retv; - auto xs = Inputs("X"); + auto xs = Inputs(in_name); retv.resize(xs.size(), nullptr); std::transform( xs.begin(), xs.end(), retv.begin(), @@ -81,12 +81,18 @@ class ConditionalBlockOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - auto xs = InputTensors(scope); - bool need_run; if (Attr("is_scalar_condition")) { + // When is_scalar_condition is True, the conditional variable is a scalar, + // whether need to execute the operators in sub-block depends on the + // conditional variable (Cond). + auto xs = InputTensors(scope, "Cond"); need_run = ScalarCondition(xs); } else { + // When is_scalar_condition is False, the conditional variable maybe a + // vector or tensor, whether need to execute the operators in sub-block + // depends on the input variables (Input). 
+ auto xs = InputTensors(scope, "Input"); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); @@ -110,11 +116,11 @@ class ConditionalBlockOp : public ConditionalOp { class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", - "The conditional variable of this operator. If X is empty, the " + AddInput("Cond", + "The conditional variable of this operator. If Cond is empty, the " "whole sub-block will not be executed.") .AsDuplicable(); - AddInput("Params", "The input variables of the sub-block.").AsDuplicable(); + AddInput("Input", "The input variables of the sub-block.").AsDuplicable(); AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); AddOutput("Scope", "(std::vector) The step scope of conditional block. To " @@ -123,13 +129,18 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddAttr( "sub_block", "The step block of conditional block operator"); AddAttr("is_scalar_condition", - "the input X is used as scalar " - "condition") + "The conditional variable (Cond) is used as scalar " + "condition.") .SetDefault(false); AddComment(R"DOC(Conditional block operator -Run the sub-block if X is not empty. Params is the other inputs and Out is the -outputs of the sub-block. +If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar, +run the operators in sub-block if Cond is True. + +If `is_scalar_condition` is False, the conditional variable (Cond) is a vector or +tensor, run the operators in sub-block if all of input variables are not empty. + + )DOC"); } }; @@ -145,12 +156,12 @@ class ConditionalBlockGradOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - auto xs = this->InputTensors(scope); - bool need_run; if (Attr("is_scalar_condition")) { + auto xs = this->InputTensors(scope, "Cond"); need_run = ScalarCondition(xs); } else { + auto xs = this->InputTensors(scope, "Input"); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); @@ -166,11 +177,11 @@ class ConditionalBlockGradOp : public ConditionalOp { auto *block = Attr("sub_block"); exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), - Outputs(framework::GradVarName("Params"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"), + Outputs(framework::GradVarName("Input"))); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), - Outputs(framework::GradVarName("X"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"), + Outputs(framework::GradVarName("Cond"))); } } @@ -199,15 +210,15 @@ class ConditionalBlockGradOp : public ConditionalOp { class ConditionalBlockGradInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInputs("X")); - if (context->HasInputs("Params")) { - PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params"))); - context->SetOutputsDim(framework::GradVarName("Params"), - context->GetInputsDim("Params")); + PADDLE_ENFORCE(context->HasInputs("Cond")); + if (context->HasInputs("Input")) { + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input"))); + context->SetOutputsDim(framework::GradVarName("Input"), + context->GetInputsDim("Input")); } - if 
(context->HasOutputs(framework::GradVarName("X"))) { - context->SetOutputsDim(framework::GradVarName("X"), - context->GetInputsDim("X")); + if (context->HasOutputs(framework::GradVarName("Cond"))) { + context->SetOutputsDim(framework::GradVarName("Cond"), + context->GetInputsDim("Cond")); } } }; @@ -220,14 +231,15 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr<framework::OpDesc> Apply() const override { auto grad_op = new framework::OpDesc(); grad_op->SetType("conditional_block_grad"); - grad_op->SetInput("X", Input("X")); - grad_op->SetInput("Params", Input("Params")); + grad_op->SetInput("Cond", Input("Cond")); + grad_op->SetInput("Input", Input("Input")); grad_op->SetInput("Out", Output("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput("Scope", Output("Scope")); - grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); - grad_op->SetOutput(framework::GradVarName("Params"), - InputGrad("Params", false)); + grad_op->SetOutput(framework::GradVarName("Cond"), + InputGrad("Cond", false)); + grad_op->SetOutput(framework::GradVarName("Input"), + InputGrad("Input", false)); grad_op->SetBlockAttr("sub_block", this->grad_block_[0]); grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); return std::unique_ptr<framework::OpDesc>(grad_op); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 3f5fab3b382bea97f43e4bc1b2cd436c956ba264..8181897c3d3844bda5574e85a08b2af038fcd664 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> { int* track_value = track.mutable_data<int>(emission_dims, platform::CPUPlace()); +#ifdef __AVX__ +// It uses the AVX or AVX512 instructions to process the data as vectors of 8 +// or 16 elements per iteration, which enables parallel processing. +// Only the float type is optimized. +#ifdef __AVX512F__ + size_t step_size = 16; +#else + size_t step_size = 8; +#endif + if (std::is_same<T, float>::value && (tag_num >= step_size)) { + size_t steps = tag_num / step_size; + size_t remain = tag_num % step_size; + int last_offset = static_cast<int>(remain) - static_cast<int>(step_size); + + // Setup the alpha initial value. + size_t i_offset = 0; + for (size_t i = 0; i <= steps; ++i) { +#ifdef __AVX512F__ + // Declare the variable for the content of weights, input and alpha + // values. + __m512 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm512_loadu_ps((const float*)(w + i_offset)); + x_content = _mm512_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm512_add_ps(w_content, x_content); + + // Save the alpha value. + _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset), + alpha_content); +#else + // Declare the variable for the content of weights, input and alpha + // values. + __m256 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm256_loadu_ps((const float*)(w + i_offset)); + x_content = _mm256_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm256_add_ps(w_content, x_content); + + // Save the alpha value.
+ _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset), + alpha_content); +#endif + i_offset += step_size; + // For the remainder, step back so the last vector overlaps the previous + // one instead of reading past the end of the tag array. + if (i == steps - 1) { + if (remain > 0) { + i_offset += last_offset; + } else { + break; + } + } + } + + // Use the column-major strategy to get the location of maximum score. + size_t seq_offset = 0; + for (size_t k = 1; k < seq_len; ++k) { + size_t j_offset = 0; + for (size_t j = 0; j <= steps; ++j) { +#ifdef __AVX512F__ + // Initialize the variables of maximum score and location. + __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max()); + __m512i max_j = _mm512_setzero_si512(); +#else + // Initialize the variables of maximum score and location. + __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); + __m256i max_j = _mm256_set1_epi32(0); +#endif + // Calculate the offset of transition_weights. + size_t trans_offset = state_trans_base_idx * tag_num + j_offset; + for (size_t i = 0; i < tag_num; ++i) { +#ifdef __AVX512F__ + // Initialize the content of alpha variable with related offset. + __m512 alpha_content = + _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m512 w_content = + _mm512_loadu_ps((const float*)(w + trans_offset)); + + __m512 score_v = _mm512_add_ps(alpha_content, w_content); + + __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); + + // According to the mask value, it updates the index of the max_score + // location. + max_j = _mm512_mask_set1_epi32(max_j, mask, i); + + // Update the max_score value. + max_score = _mm512_max_ps(max_score, score_v); +#else + // Initialize the content of alpha variable with related offset. + __m256 alpha_content = _mm256_broadcast_ss( + (const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m256 w_content = + _mm256_loadu_ps((const float*)(w + trans_offset)); + __m256 score_v = _mm256_add_ps(alpha_content, w_content); + + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); + +#ifdef __AVX2__ + // According to the mask value, it updates the index of the max_score + // location. + max_j = _mm256_or_si256( + _mm256_andnot_si256((__m256i)mask, max_j), + _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); +#else + __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); + __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); + __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); + __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); + + lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); + hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); + lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); + hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); + + lo_max_j = _mm_or_si128(lo_mask, lo_max_j); + hi_max_j = _mm_or_si128(hi_mask, hi_max_j); + + // According to the mask value, it updates the index of the max_score + // location. + max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); + max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); +#endif + + // Update the max_score value. + max_score = _mm256_max_ps(max_score, score_v); +#endif + trans_offset += tag_num; + } + +#ifdef __AVX512F__ + // Update the alpha and track values.
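+ // Add the emission scores of step k before storing the new alpha values and the argmax track.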
+ __m512 x_content = _mm512_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm512_add_ps(max_score, x_content); + _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm512_storeu_si512( + reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#else + // Update the alpha and track values. + __m256 x_content = _mm256_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm256_add_ps(max_score, x_content); + _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#endif + + // Calculate the offset of next step + j_offset += step_size; + if (j == steps - 1) { + if (remain > 0) { + j_offset += last_offset; + } else { + break; + } + } + } + + seq_offset += tag_num; + } + } else { + for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (size_t k = 1; k < seq_len; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T max_score = -std::numeric_limits<T>::max(); + int max_j = 0; + for (size_t j = 0; j < tag_num; ++j) { + T score = alpha_value[(k - 1) * tag_num + j] + + w[(j + state_trans_base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + } +#else for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; for (size_t k = 1; k < seq_len; ++k) { @@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> { } } +#endif T max_score = -std::numeric_limits<T>::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) { diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index bc3e95e904f8b6c2cdd2ae6685bf67580178e6b6..f90dcdc156590b776f817a4933d5a9b45868ba98 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -80,6 +80,9 @@ inline framework::DDim trim_trailing_singular_dims( for (int i = 0; i < actual_dims_size; ++i) { trim_dims[i] = dims[i]; } + if (trim_dims.size() == 0) { + return framework::DDim(framework::make_dim()); + } framework::DDim actual_dims = framework::make_ddim(trim_dims); return actual_dims; } diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 099ca52c8e945a0e93c2f13adb612158c67397cf..72287ae6ac60f8de5eb62733791b0c9353dbe86b 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -15,8 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" #include <vector> #include "paddle/fluid/operators/math/blas.h" - -DECLARE_int32(paddle_num_threads); +#include "paddle/fluid/operators/math/fc_compute.h" namespace paddle { namespace operators { @@ -110,13 +109,8 @@ void FCOpMaker::Make() { AddComment(R"DOC( Fully Connected Operator. - The fully connected operation calculates the output based on the input, weights and bias attribute. + The fully connected operation calculates the output based on the input, weights and bias. The size of each dimension of the parameters checked in the infer-shape. - The matrix of bias is generated by the mkldnn framework, when the bias_attr is True. - Additional parametrs are use_mkldnn and bias_attr. - The input(X) size and output(Out) size may be diffrent.
- - The fully connected layer only supports MKLDNN version )DOC"); } @@ -133,26 +127,15 @@ class FCOpKernel : public framework::OpKernel<T> { auto in_dims = input->dims(); auto w_dims = w->dims(); - auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>(); - auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx); const T* input_data = input->data<T>(); const T* w_data = w->data<T>(); T* output_data = output->mutable_data<T>(ctx.GetPlace()); + auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx); + math::FCCompute<platform::CPUDeviceContext, T>( + blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data, + bias ? bias->data<T>() : NULL); - blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0], - static_cast<T>(1), input_data, w_data, static_cast<T>(0), - output_data); - - if (bias) { - const T* bias_data = bias->data<T>(); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) -#endif - for (int bs = 0; bs < in_dims[0]; bs++) { - blas.AXPY(w_dims[1], static_cast<T>(1), bias_data, - output_data + bs * w_dims[1]); - } - } + // TODO(TJ): fuse act } }; diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3888333ec5626f1d8d35db215085f483c985cf0a --- /dev/null +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/fusion_lstm_op.h" +#include <string> +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Input(WeightX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Input(WeightH) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("XX"), + "Output(XX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), + "Output(BatchedGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchCellPreAct) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 4; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"), + "Do not support peephole yet."); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); + + int xx_width = x_dims[1] > wx_dims[1] ?
wx_dims[1] : x_dims[1]; + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("WeightX", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. 
" + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); + AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. +)DOC"); +} + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + // TODO(TJ): check mem copy perf + row_shuffle(ctx, src, index_lod, dst, indexed_src); +} + +template +class FuisonLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* xx = ctx.Output("XX"); + auto* batched_gate = ctx.Output("BatchedGate"); + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + bool is_reverse = ctx.Attr("is_reverse"); + + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + cell_out->mutable_data(ctx.GetPlace()); + + const T* x_data = x->data(); + const T* wx_data = wx->data(); + auto x_dims = x->dims(); + auto wx_dims = wx->dims(); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + if (x_dims[1] > wx_dims[1]) { + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + x_data, wx_data, xx_data, + bias->data()); + to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_gate->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + xx_data, wx_data, batched_gate_data, + bias->data()); + } + + int frame_size = static_cast(wx_dims[1] / 4); + framework::DDim out_dims({x_dims[0], frame_size}); + math::LstmMetaValue lstm_value; + // no peephole + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + 
+
+    int frame_size = static_cast<int>(wx_dims[1] / 4);
+    framework::DDim out_dims({x_dims[0], frame_size});
+    math::LstmMetaValue<T> lstm_value;
+    // no peephole
+    lstm_value.check_ig = nullptr;
+    lstm_value.check_fg = nullptr;
+    lstm_value.check_og = nullptr;
+    lstm_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+
+    framework::Vector<size_t> order(batched_gate->lod()[2]);
+
+    if (cell_t0) {
+      // Since batch computing for the LSTM reorders the input sequences
+      // by length, the initial cell state also needs to be reordered.
+      ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
+                                         true);
+      lstm_value.prev_state_value = ordered_c0.data<T>();
+    }
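The `order` vector above comes from the batch layout built by `LoDTensor2BatchFunctor`. A rough Python sketch of that layout (an illustration of the idea, not the functor's actual code): sequences are processed sorted by descending length, and `batch_starts` (used below as `lod()[0]`) marks each time step's slice:

    def lod_to_batch_layout(seq_lens):
        """Sketch: batch step t holds the t-th element of every sequence
        that is still alive at step t, with sequences sorted by length."""
        order = sorted(range(len(seq_lens)), key=lambda i: -seq_lens[i])
        batch_starts = [0]
        for t in range(max(seq_lens)):
            alive = sum(1 for n in seq_lens if n > t)
            batch_starts.append(batch_starts[-1] + alive)
        return order, batch_starts

    # lod [[2, 3, 2]] -> order [1, 0, 2], batch_starts [0, 3, 6, 7]
    print(lod_to_batch_layout([2, 3, 2]))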
+
+    // Use local variables here.
+    LoDTensor batch_hidden, batch_cell;
+    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+    batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
+    batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
+    batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
+
+    auto batch_starts = batched_gate->lod()[0];
+    size_t max_seq_len = batch_starts.size() - 1;
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+
+    for (size_t n = 0; n < max_seq_len; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate_t = batched_gate->Slice(bstart, bend);
+      Tensor out_t = batch_hidden.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
+        // TODO(TJ): use gemm directly
+        blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // TODO(TJ): move h0 outside the loop
+        // If n == 0 and there is no initial hidden state, that is to say
+        // H0 is all zeros, the calculation W_h * H0 can be skipped.
+        // If n == 0 and there is an initial hidden state, calculate W_h * H0.
+
+        // Since batch computing for the LSTM reorders the input sequences
+        // by length, the initial hidden state also needs to be reordered.
+        Tensor ordered_h0;
+        ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        // TODO(TJ): use gemm directly
+        blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
+                    static_cast<T>(1.0));
+      }
+
+      lstm_value.gate_value = gate_t.data<T>();
+      lstm_value.output_value = out_t.data<T>();
+      lstm_value.state_value = cell_t.data<T>();
+      lstm_value.state_active_value = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
+          cand_act);
+      lstm_value.prev_state_value = lstm_value.state_value;
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden.set_lod(batched_gate->lod());
+    // restore the output hidden in a LoDTensor from the batched hidden
+    to_seq(dev_ctx, batch_hidden, hidden_out);
+
+    batch_cell.set_lod(batched_gate->lod());
+    // restore the output cell state in a LoDTensor from the batched cell
+    to_seq(dev_ctx, batch_cell, cell_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+
+REGISTER_OP_CPU_KERNEL(
+    fusion_lstm,
+    ops::FusionLSTMKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FusionLSTMKernel<paddle::platform::CPUDeviceContext, double>);
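Each `LstmUnitFunctor::compute` call advances all still-active sequences by one time step. A minimal NumPy sketch of a single step with the default activations and the {c, i, f, o} gate order documented above (a simplification without peepholes, matching this kernel):

    import numpy as np

    def sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    def lstm_step(gate, c_prev):
        """gate: (batch, 4D) pre-activations = x*Wx + h_prev*Wh + bias."""
        c_cand, g_i, g_f, g_o = np.split(gate, 4, axis=1)
        c = sigmoid(g_f) * c_prev + sigmoid(g_i) * np.tanh(c_cand)
        h = sigmoid(g_o) * np.tanh(c)
        return h, c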
diff --git a/paddle/fluid/operators/fusion_lstm_op.h b/paddle/fluid/operators/fusion_lstm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..39dc09b4d116193399d8ac9a51e88dbc3e239918
--- /dev/null
+++ b/paddle/fluid/operators/fusion_lstm_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+// #include <string>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+class FusionLSTMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..8600fa9e2c4db9d54cbe0ffb68f82d52c086d4f7
--- /dev/null
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/operators/math/blas.h"
+
+DECLARE_int32(paddle_num_threads);
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename DeviceContext, typename T>
+inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
+                      const int N, const int K, const T* X, const T* W, T* Y,
+                      const T* B = NULL) {
+  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), X, W,
+            static_cast<T>(0), Y);
+  if (B) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#endif
+    for (int i = 0; i < M; i++) {
+      blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
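Numerically, `FCCompute` is just a GEMM plus a per-row bias add (the AXPY loop); a NumPy sketch of the same contract:

    import numpy as np

    def fc_compute(X, W, B=None):
        """Y = X @ W, then add the bias row B to every row of Y."""
        Y = X @ W                  # GEMM: (M, K) x (K, N) -> (M, N)
        if B is not None:
            Y += B.reshape(1, -1)  # one AXPY per row, parallel under MKLML
        return Y

    X = np.random.rand(4, 3)
    W = np.random.rand(3, 5)
    B = np.random.rand(5)
    assert fc_compute(X, W, B).shape == (4, 5)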
diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
index ce0ddd89bfb0d73e237a6f9a777376624d8ef2d4..cdcba8035762d8f442eb8b8ed52a4e3e99ac31b6 100644
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
@@ -1,3 +1,3 @@
-if(WITH_GPU)
+if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
 endif()
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 6c507baf3a0ab0a557d29a53700685753616193b..8a683116b8054de12fc4419b5aa5fbc019b675bb 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -23,9 +23,9 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SqueezeOp should not be null.");
+                   "Input(X) of Squeeze operator should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SqueezeOp should not be null.");
+                   "Output(Out) of Squeeze operator should not be null.");
     const auto &x_dims = ctx->GetInputDim("X");
     // Check input tensor dims (<6) Eigen limit.
@@ -107,7 +107,6 @@ class SqueezeOp : public framework::OperatorBase {
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
     // Invoke Reshape Op
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
@@ -125,12 +124,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
               "(std::vector<int>). List of integers,"
              " indicating the dimensions to squeeze.")
         .SetDefault({});
-    AddAttr<bool>("inplace",
-                  "(default: false) Squeeze the source tensor's shape without "
-                  "memory copy. When Attr(inplace) is set true, the output "
-                  "tensor shares memory with Input(X), otherwise, a new output "
-                  "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
 Squeeze Operator.
@@ -180,7 +173,6 @@ class SqueezeGradOp : public framework::OperatorBase {
     auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(x_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index f2a15fdf572e0de30f9949dda5020e130b0c5585..0fc8d54f6400c9dfb6af1e764ed44e95195bfe6e 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -23,9 +23,9 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UnsqueezeOp should not be null.");
+                   "Input(X) of Unsqueeze operator should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of UnsqueezeOp should not be null.");
+                   "Output(Out) of Unsqueeze operator should not be null.");
     const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
     const auto &x_dims = ctx->GetInputDim("X");
@@ -95,7 +95,6 @@ class UnsqueezeOp : public framework::OperatorBase {
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
     // Invoke Reshape op.
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
@@ -126,13 +125,6 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
               " within [1, 6] dimensions (Eigen limit).");
         }
       });
-    AddAttr<bool>(
-        "inplace",
-        "(default: false) Unsqueeze the source tensor's shape without "
-        "memory copy. When Attr(inplace) is set true, the output "
-        "tensor shares memory with Input(X), otherwise, a new output "
-        "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
 Unsqueeze Operator.
@@ -168,7 +160,6 @@ class UnsqueezeGradOp : public framework::OperatorBase {
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(x_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 9da787a4073fa002f75154f7c4fba54e9ed8efa6..07159d4a12ef4b628f7705ed206d3334be46dfc8 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
 # There is no macOS version of NCCL.
-if (NOT APPLE)
+if (NOT APPLE AND NOT WIN32)
   list(APPEND CUDA_SRCS nccl.cc)
 endif()
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 81b5359b40589d898bda0dfa71afb6f51385354b..6c2331b75f64b777adcdca4245d503bb5a52e1a6 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -44,7 +44,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
-#ifndef __APPLE__
+#if !defined(__APPLE__) and !defined(_WIN32)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA
@@ -205,7 +205,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 #endif
 }
 
-#ifndef __APPLE__
+#if !defined(__APPLE__) and !defined(_WIN32)
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     ncclResult_t stat, const Args&... args) {
@@ -221,7 +221,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 #endif
   }
 }
-#endif  // __APPLE__
+#endif  // __APPLE__ and windows
 #endif  // PADDLE_WITH_CUDA
 
 template <typename T>
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 6c58478b0dd0941ab4bf4d573a3c813059650ba8..67734659233515ca8110f4212a2b1553fe4e9d24 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -596,8 +596,8 @@ All parameter, weight, gradient are variables in Paddle.
   // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
-      .def(py::init())
+  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
+  exec_strategy.def(py::init())
       .def_property(
           "num_threads",
           [](const ExecutionStrategy &self) { return self.num_threads_; },
@@ -624,6 +624,16 @@ All parameter, weight, gradient are variables in Paddle.
           [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
             self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
           });
+  exec_strategy.def_property(
+      "use_experimental_executor",
+      [](const ExecutionStrategy &self) {
+        return self.type_ == ExecutionStrategy::kExperimental;
+      },
+      [](ExecutionStrategy &self, bool experimental) {
+        self.type_ = experimental ? ExecutionStrategy::kExperimental
+                                  : ExecutionStrategy::kDefault;
+      });
+
   py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
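From Python, the new property can be toggled like this (a minimal usage sketch; the test-base change later in this patch uses exactly this path):

    import paddle.fluid as fluid

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.allow_op_delay = False
    # Maps to ExecutionStrategy::kExperimental in the binding above;
    # setting it back to False restores ExecutionStrategy::kDefault.
    exec_strategy.use_experimental_executor = True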
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 173567a0a374ac4453025b67b047950936df2055..8bfe11916bd069cd2dd7016c03644d6cad1e188d 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1272,8 +1272,8 @@ class ConditionalBlock(object):
         parent_block.append_op(
             type='conditional_block',
             inputs={
-                'X': self.inputs,
-                'Params': param_list,
+                'Cond': self.inputs,
+                'Input': param_list,
             },
             outputs={'Out': out_list,
                      'Scope': [step_scope]},
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
index 10918a985f80f9f20629180b0359ce73f448d33b..61d81f483636a99ea9e0282de89f12e47f3b824c 100644
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -30,7 +30,8 @@ import numpy as np
 class TestMNISTIfElseOp(unittest.TestCase):
-    def test_raw_api(self):
+    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
+    def not_test_raw_api(self):
         prog = Program()
         startup_prog = Program()
         with program_guard(prog, startup_prog):
@@ -91,7 +92,8 @@ class TestMNISTIfElseOp(unittest.TestCase):
             return
         self.assertFalse(True)
 
-    def test_ifelse(self):
+    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
+    def not_test_ifelse(self):
         prog = Program()
         startup_prog = Program()
         with program_guard(prog, startup_prog):
@@ -153,6 +155,13 @@ class TestIfElse(unittest.TestCase):
         self.cond_value = 0.5
         self.data = np.random.rand(25, 1).astype(np.float32)
 
+    def numpy_cal(self):
+        s1 = self.data[np.where(self.data < self.cond_value)]
+        res = np.sum(np.exp(s1))
+        s2 = self.data[np.where(self.data >= self.cond_value)]
+        res += np.sum(np.tanh(s2))
+        return res
+
     def compare_ifelse_op_and_numpy(self, place):
         self.set_test_case()
@@ -166,10 +175,12 @@ class TestIfElse(unittest.TestCase):
         ie = layers.IfElse(ifcond)
         with ie.true_block():
             true_target = ie.input(src)
+            true_target = fluid.layers.exp(true_target)
             ie.output(true_target)
         with ie.false_block():
             false_target = ie.input(src)
+            false_target = fluid.layers.tanh(false_target)
             ie.output(false_target)
         if_out = ie()
         out = layers.reduce_sum(if_out)
@@ -180,7 +191,8 @@ class TestIfElse(unittest.TestCase):
             o1, = exe.run(fluid.default_main_program(),
                           feed={'data': self.data},
                           fetch_list=[out])
-            o2 = np.sum(self.data)
+            o2 = self.numpy_cal()
+
             self.assertTrue(
                 np.allclose(
                     o1, o2, atol=1e-8),
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 9be53c4609d5027dd4e482ab936a135420a346ae..74e9d5c5f91e53a315c85d428571ce45bacede8a 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -38,7 +38,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                              seed=None,
                              use_parallel_executor=True,
                              use_reduce=False,
-                             optimizer=fluid.optimizer.Adam):
+                             optimizer=fluid.optimizer.Adam,
+                             use_fast_executor=False):
         def run_executor(exe, feed, fetch_list, program=None):
             if isinstance(exe, fluid.ParallelExecutor):
                 res = exe.run(fetch_list=fetch_list, feed=feed)
@@ -71,6 +72,8 @@ class TestParallelExecutorBase(unittest.TestCase):
             startup_exe.run(startup)
             exec_strategy = fluid.ExecutionStrategy()
             exec_strategy.allow_op_delay = allow_op_delay
+            if use_fast_executor:
+                exec_strategy.use_experimental_executor = True
 
             build_strategy = fluid.BuildStrategy()
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
index 2bb920710a9b10f3a8159bad3b33dd15ffbada19..ff417ad2f16b83cd42a0603375c14450195e7fc0 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -64,27 +64,47 @@ class TestFCOp(OpTest):
         self.check_output()
 
 
-class TestFCOpBiasBoth(TestFCOp):
+class TestFCOpNoBias(TestFCOp):
     def init_shapes(self, mb, ic, oc, h, w):
-        for with_bias in {True, False}:
-            self.with_bias = with_bias
-            self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+        self.with_bias = False
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)
 
 
-class TestFCOp1(TestFCOpBiasBoth):
+class TestFCOpWithBias(TestFCOp):
+    def init_shapes(self, mb, ic, oc, h, w):
+        self.with_bias = True
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+
+
+class TestFCOp1(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(2, 8, 10, 1, 1)
 
 
-class TestFCOp2(TestFCOpBiasBoth):
+class TestFCOp2(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(4, 5, 6, 2, 2)
 
 
-class TestFCOp4(TestFCOpBiasBoth):
+class TestFCOp4(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(1, 32, 64, 3, 3)
 
 
+class TestFCOpWithBias1(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(3, 8, 10, 2, 1)
+
+
+class TestFCOpWithBias2(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(4, 5, 6, 2, 2)
+
+
+class TestFCOpWithBias3(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(1, 64, 32, 3, 3)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d8bef677fd16fb6bdc20b929137b4d885f4efd1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_lstm_op import lstm, ACTIVATION
+
+
+def fc(x, w, b):
+    return np.dot(x, w) + b
+
+
+def fusion_lstm(
+        x,  # T x M
+        lod,  # 1 x N
+        wx=None,  # M x 4D
+        bx=None,  # 1 x 4D
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_h=None,  # D x 4D
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None):
+    return lstm(
+        fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
+        act_cell, act_cand)
+
+
+class TestLstmOp(OpTest):
+    def set_argument(self):
+        self.lod = [[2, 3, 2]]
+
+    def setUp(self):
+        self.op_type = 'fusion_lstm'
+        self.lod = [[2, 3, 2]]
+        self.M = 8
+        self.D = 16
+        self.has_initial_state = False
+        self.is_reverse = False
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+        self.use_peepholes = False
+        self.set_argument()
+
+        T = sum(self.lod[0])
+        bs = len(self.lod[0])
+
+        x = np.random.normal(size=(T, self.M)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(bs, self.D)).astype('float64')
+            c0 = np.random.normal(size=(bs, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((bs, self.D)).astype('float64')
+            c0 = np.zeros((bs, self.D)).astype('float64')
+
+        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
+
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+        w_b = np.copy(b[:, 0:4 * self.D])
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
+
+        # this is the weight of fc
+        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64')
+        # this is the bias of fc
+        # and it should be manually added into the bias of this fusion LSTM
+        bx = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+        b[0, 0:4 * self.D] += bx[0, :]
+        h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
+                           self.is_reverse, ACTIVATION[self.act_gate],
+                           ACTIVATION[self.act_cell],
+                           ACTIVATION[self.act_cand])
+
+        self.inputs = {
+            'X': (x, self.lod),
+            'WeightX': wx,
+            'WeightH': wh,
+            'Bias': b
+        }
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        self.outputs = {
+            'Hidden': (h, self.lod),
+            'Cell': (c, self.lod),
+        }
+        self.attrs = {
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand
+        }
+
+    def test_check_output(self):
+        self.check_output(atol=1e-8)
+
+
+class TestLstmOpInitReverse(TestLstmOp):
+    def set_argument(self):
+        self.has_initial_state = True
+        self.is_reverse = True
+
+
+class TestLstmOpMD1(TestLstmOp):
+    def set_argument(self):
+        self.M = 36
+        self.D = 8
+
+
+class TestLstmOpMD2(TestLstmOp):
+    def set_argument(self):
+        self.M = 8
+        self.D = 8
+
+
+class TestLstmOpMD3(TestLstmOp):
+    def set_argument(self):
+        self.M = 15
+        self.D = 3
+
+
+class TestLstmOpBS1(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[3]]
+        self.D = 16
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 893acd763fb21fc5040b8a24700eb947a9fe37c6..5b96d641d667eee1aa0c7c6019bf92494f777259 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -183,7 +183,9 @@ class TestMNIST(TestParallelExecutorBase):
             use_parallel_executor=True)
 
         self.assertAlmostEquals(
-            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+            np.mean(parallel_first_loss),
+            single_first_loss,
+            delta=1e-6, )
         self.assertAlmostEquals(
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
@@ -191,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_parallel_accuracy(True)
         self.check_simple_fc_parallel_accuracy(False)
 
-    def check_batchnorm_fc_convergence(self, use_cuda):
+    def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -203,11 +205,13 @@ class TestMNIST(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_cuda=use_cuda)
+            use_cuda=use_cuda,
+            use_fast_executor=use_fast_executor)
 
     def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(True)
-        self.check_batchnorm_fc_convergence(False)
+        for use_cuda in (False, True):
+            for use_fast_executor in (False, True):
+                self.check_batchnorm_fc_convergence(use_cuda,
+                                                    use_fast_executor)
 
     def test_batchnorm_fc_with_new_strategy(self):
         # FIXME(zcd): close this test temporally.
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
index a2a5584459a5d4dc416b9c542a4bb0567982e765..2be8e24a0fae6945351eb767ac924d7ca70848ab 100644
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -41,7 +41,7 @@ class TestSqueezeOp(OpTest):
         self.new_shape = (3, 5)
 
     def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}
 
 
 # Correct: There is mins axis.
@@ -68,49 +68,5 @@ class TestSqueezeOp3(TestSqueezeOp):
         self.new_shape = (3, 5, 1, 4)
 
 
-# Correct: Inplace.
-class TestSqueezeOpInplace1(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, 2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins axis.
-class TestSqueezeOpInplace2(TestSqueezeOp):
-    def inti_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, -2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. No axes input.
-class TestSqueezeOpInplace3(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = ()
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inpalce. Just part of axes be squeezed.
-class TestSqueezeOpInplace4(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 1, 5, 1, 4, 1)
-        self.axes = (1, -1)
-        self.new_shape = (3, 5, 1, 4)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
     unittest.main()
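The retained cases map directly onto NumPy's squeeze semantics; for instance, the base case above corresponds to:

    import numpy as np

    x = np.random.rand(1, 3, 1, 5)
    out = np.squeeze(x, axis=(0, 2))  # axes (0, 2) of (1, 3, 1, 5) -> (3, 5)
    assert out.shape == (3, 5)        # matches TestSqueezeOp's new_shape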
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 5fcabe4c83457a91ff0e9cd5568904698353b62a..a324438ba5a3c3b57fd956bd11189ef7d50267e2 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -41,7 +41,7 @@ class TestUnsqueezeOp(OpTest):
         self.new_shape = (3, 1, 1, 5)
 
     def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}
 
 
 # Correct: Single input index.
@@ -76,38 +76,5 @@ class TestUnsqueezeOp4(TestUnsqueezeOp):
         self.new_shape = (3, 1, 1, 2, 5, 1)
 
 
-# Correct: Inplace.
-class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, 2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins index.
-class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, -2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is duplicated axis.
-class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (0, 3, 3)
-        self.new_shape = (1, 3, 2, 1, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 294308f1877360174c69cc59b0d2037e494985e7..d094647afe1900809fc32cae93f777765f72c675 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -285,11 +285,12 @@ class Trainer(object):
             self._load_checkpoint()
 
         if param_path and os.path.isdir(param_path):
-            # load params from param_path into scope
-            io.load_persistables(
-                executor=exe,
-                dirname=param_path,
-                main_program=self.startup_program)
+            with self._prog_and_scope_guard():
+                # load params from param_path into scope
+                io.load_persistables(
+                    executor=exe,
+                    dirname=param_path,
+                    main_program=self.startup_program)
 
     def _transpile_nccl2_dist(self):
         # PADDLE_TRAINER_IPS