diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 86096d4feaace040da416a01872882456c4098fc..566dc75fda019eb66759eb403f60e16f18cffef1 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -57,46 +57,43 @@ int main()
 return 0;
 }" SSE3_FOUND)
 
-# disable AVX by default on windows
-if(NOT WIN32)
-  # Check AVX
-  set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-  set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-  CHECK_CXX_SOURCE_RUNS("
-  #include <immintrin.h>
-  int main()
-  {
-    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-    __m256 result = _mm256_add_ps (a, b);
-    return 0;
-  }" AVX_FOUND)
+# Check AVX
+set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+  __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+  __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+  __m256 result = _mm256_add_ps (a, b);
+  return 0;
+}" AVX_FOUND)
 
-  # Check AVX 2
-  set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-  set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-  CHECK_CXX_SOURCE_RUNS("
-  #include <immintrin.h>
-  int main()
-  {
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
-    return 0;
-  }" AVX2_FOUND)
+# Check AVX 2
+set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+  __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+  __m256i result = _mm256_abs_epi32 (a);
+  return 0;
+}" AVX2_FOUND)
 
-  # Check AVX512F
-  set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-  set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-  CHECK_CXX_SOURCE_RUNS("
-  #include <immintrin.h>
-  int main()
-  {
-    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                  13, -5, 6, -7, 9, 2, -6, 3);
-    __m512i result = _mm512_abs_epi32 (a);
-    return 0;
-  }" AVX512F_FOUND)
-endif(NOT WIN32)
+# Check AVX512F
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+  __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                13, -5, 6, -7, 9, 2, -6, 3);
+  __m512i result = _mm512_abs_epi32 (a);
+  return 0;
+}" AVX512F_FOUND)
 
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 47936a2ac20c6a51063ee7f30c59decdc41342cb..efc31c1b1e64926ea2f8a5ee61510e1ede95c10a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
   cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
   foreach(src ${windows_symbolic_SRCS})
-    get_filename_component(src ${src} NAME_WE)
-    if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
-      message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
-    endif()
-
-#only copy the xx.cu to.xx.cu when the content are modified
-    set(copy_flag 1)
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
-      file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc 
SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() @@ -78,18 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) -cc_test(variable_test SRCS variable_test.cc) - cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits dynload_cuda) +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function) + DEPS operator op_registry device_context math_function scope) if(WITH_GPU) if (WIN32) diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index c9ec5e7a7b37b62efbf3d980e93b5518364d99c9..96a2f9250ff928fe58a5339a25c68c9db515522d 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index abacb11e3b018308c20a67630e3ff34cca7d3387..03fbfd7f24a8a987db72f45be777acc7ece577a6 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
     }
   } else {
     PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                 var->Type().name(), name);
+                 framework::ToTypeName(var->Type()), name);
   }
 }
 
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 37b07e5736312b3050debe745f2d3c108469c5d6..15c496130c2b6c7643ff96661be09e5ac4870344 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{1};
+  size_t num_iteration_per_drop_scope_{100};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 7e320a08942e4a9a27e6b5c892a993b3a90c43a4..5b9a81811728b7e6c5314738920fd4b5e503ab5c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -42,6 +42,12 @@ namespace {
 typedef std::vector<OpHandleBase *> GraphOps;
 const char kGraphOps[] = "ops";
 
+bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) {
+  return boost::get<int>(
+             node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+         static_cast<int>(role);
+}
+
 void PolishGraphToSupportDataHazards(ir::Graph *graph) {
   for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
@@ -147,6 +153,7 @@ void MultiDevSSAGraphBuilder::Init() const {
 #endif
   balance_vars_.resize(places_.size(), 0);
+
   if (strategy_.enable_data_balance_ && places_.size() == 1) {
     LOG(WARNING) << "It is no need to enable data balance when there is only "
                     "one place. 
enable_data_balance is set to False."; @@ -154,145 +161,16 @@ void MultiDevSSAGraphBuilder::Init() const { } } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (ir::Node *input : node->inputs) { - VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); - op_handle->AddInput(var); - } - - for (ir::Node *output : node->outputs) { - ir::Node *new_node = nullptr; - if (output->Var()) { - new_node = result->CreateVarNode(output->Var()); - } else { - new_node = - result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); - } - CreateOpOutput(result, op_handle, new_node, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector &nodes) const { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector &nodes) const { - std::vector recv_vars; - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -// Topology sort the graph nodes from inputs to outputs. -// Since SSAGraphBuilder depends on forward/backward nodes to assign devices -// to parameter/gradients before optimizer ops, topo sort is insufficient. ( -// some optimizer ops might not depend on any nodes), we manually move all -// optimizer nodes after last backward nodes. -// However, the assumption by SSAGraphBuilder should be relaxed in the future. 
-std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); - size_t last_backward = 0; - for (size_t i = 0; i < ret.size(); ++i) { - if (boost::get( - ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = i; - } - } - - std::vector optimize_ops; - std::vector sorted_ret; - for (size_t i = 0; i < ret.size(); ++i) { - if (i < last_backward) { - if (static_cast(boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kOptimize))) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); - } - } else if (i == last_backward) { - sorted_ret.push_back(ret[i]); - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (ir::Node *n : sorted_ret) { - for (ir::Node *in : n->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), n->Name()); - } - } - } - sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), - optimize_ops.end()); - } else { - sorted_ret.push_back(ret[i]); - } - } - return sorted_ret; -} - std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { Init(); // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + std::vector sorted_ops = ir::TopologySortOperations(*graph); + + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + sorted_ops = SortForReduceMode(sorted_ops); + } + auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; @@ -303,31 +181,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( all_vars_.emplace(node->Name(), node->Var()); } } - std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - // find send/recv vars so that we can place the distributed training - // related op in the place 0 - auto send_vars = FindDistTrainSendVars(sorted_ops); - auto recv_vars = FindDistTrainRecvVars(sorted_ops); - std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); - size_t cur_device_id = 0; bool is_forwarding = true; bool is_dist_train = false; std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - if (boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { + if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -341,9 +210,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } is_dist_train = true; - } else if (boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kDist)) { + } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); if (node->Op()->Type() == "concat") { auto origin_param_name = node->Op()->OutputArgumentNames()[0]; @@ -365,7 +232,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // the block. 
is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + int op_dev_id = GetOpDeviceID(node, sharded_var_device); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -385,47 +252,48 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } + try { + auto backward_vars = boost::get>( + node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + size_t cur_device_id = -1; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + sharded_var_device.emplace(g_name, cur_device_id); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; } - } catch (boost::bad_get e) { } + } catch (boost::bad_get e) { } } } @@ -469,12 +337,108 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +std::vector MultiDevSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::unordered_map sharded_var_device; + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + + auto insert_delayed_op = [&](const std::string 
&var_name, int dev_id) { + sharded_var_device.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); + } + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. + } } - return false; + + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); + return sorted_ops; +} + +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); + } +} + +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; } void MultiDevSSAGraphBuilder::SetCommunicationContext( @@ -625,28 +589,52 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( } int MultiDevSSAGraphBuilder::GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, + const std::unordered_map &sharded_var_device, + std::unordered_map> *delay_ops) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + + if (!OpHaveRole(*node, 
framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +int MultiDevSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, const std::unordered_map &sharded_var_device) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int op_role = boost::get( - node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - if (op_role != static_cast(framework::OpRole::kOptimize)) { + + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { return -1; } auto param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); if (got == sharded_var_device.end()) { @@ -740,8 +728,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( node->Op()->Type() == "split_selected_rows" || node->Op()->Type() == "split_ids") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { @@ -752,8 +739,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( sharded_var_device->emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); for (auto &varname : output_var_names) { sharded_var_device->emplace(varname, op_dev_id); } @@ -794,8 +780,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = - GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -825,8 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = - GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -861,8 +845,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = - GetVarDeviceID(*result, output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -879,6 +862,14 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; + } + return false; +} + bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { return boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 5736102ddc13418446013307cf8204b677f960dc..7029e9dc18cbacf0c5f0d7c6430d84fb72d6a0a3 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { #endif int GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const; bool IsScaleLossOp(ir::Node *node) const; @@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ir::Graph *result, ir::Node *node, std::unordered_map *sharded_var_device) const; - std::vector FindDistTrainSendVars( - const std::vector &nodes) const; - - std::vector FindDistTrainRecvVars( - const std::vector &nodes) const; - void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; @@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { int dev_id) const; int GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, const std::unordered_map &sharded_var_device) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; + std::vector SortForReduceMode( + const std::vector &) const; + + int GetOpDeviceID( + ir::Node *node, + const std::unordered_map &shared_var_device, + std::unordered_map> *delay_ops) + const; + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc 
b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 00b8136dc2ea06cc06ae5b586f6291cb866d950c..1ed4b2c8e860312a88450a0eba9c2de9191f5fe8 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -85,7 +85,6 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ = 0; } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 3dfd14419d94379a0bf79f55d7a139acd77cbd7e..134f759081a0778194c20785e215420d6e2bb622 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", var->Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); } } @@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", var.Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index da9556c6c1f3468208db02f2958ad6ad137c6566..594fbb48a6d12715fa42494e30bc8f50fbe171ef 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -119,7 +119,7 @@ static void DeleteUnusedTensors( } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } } diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 23f343f631628b432d91d4504019895ed4bac4a5..c6121777e8d2c32193b5c170bb0fa3f0337c9bc3 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( const std::string& output) { auto proto = base_desc; framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); desc.SetInput("Bias", {bias}); desc.SetInput("ResidualData", {bias1}); desc.SetAttr("activation", activation); desc.SetOutput("Output", {output}); desc.SetAttr("is_test", true); - + desc.SetAttr("use_cudnn", false); + desc.Flush(); return *desc.Proto(); } std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; FusePassBase::Init(pattern_name, graph.get()); GraphPatternDetector gpd; @@ -76,22 +78,23 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( framework::OpDesc new_op_desc(new_op_proto, nullptr); // Create a new node for the fused op. - graph->CreateOpNode(&new_op_desc); + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. 
PADDLE_ENFORCE(subgraph.count(x)); auto* conv_in_node = subgraph.at(x); - IR_NODE_LINK_TO(conv_in_node, conv_op); // Input - IR_NODE_LINK_TO(conv_filter, conv_op); // Filter - IR_NODE_LINK_TO(conv_op, conv_out); // Output - IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), - {conv_op, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out}); + GraphSafeRemoveNodes( + graph.get(), + {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; gpd(graph.get(), handler); return graph; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8e67f8f6104cdaa8227c34351c6355ab8fc81b09..3eb5bdba3b7275f45cdfc6ad47f75e7a423541d0 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -20,102 +20,11 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" -DEFINE_bool(enforce_when_check_program, true, - "Checking whether the program is correct or not. We will log " - "errors rather than throwing exceptions if this flag turned off"); - namespace paddle { namespace framework { namespace ir { -namespace { - -void CheckProgram(const ProgramDesc &program) { -#define _INT(role) static_cast(role) - - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator %s after optimize operator." - << op->Type(); - } - } - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) != - visit.end()) { - LOG(ERROR) << "Cannot add backward|loss operator before " - << "forward|loss operator %s." << op->Type(); - } - - if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { - LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " - "operator." 
- << op->Type(); - } - } - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { - LOG(ERROR) << "Optimize operators %s must follow backward operator." - << op->Type(); - } - } - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. Don't add new role because " - "you don't know what you are doing."; - } - } - -#undef _INT -} -} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { - CheckProgram(program_); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e5167c039ec8d9e4300b190a726bb02a63..73d1a3da8fc921eea77a2bddbe1cb63bd9832ea3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -std::unordered_set conv_act_set({"identity", "sigmoid", "relu", - "relu6", "relux", "tanh", - "band_pass"}); +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ->AsInput(); auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) ->assert_is_op_output("elementwise_add") - ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_input("elementwise_add", "Y") ->AsIntermediate(); auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) ->assert_is_op("elementwise_add"); auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) - ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_op_input("elementwise_add", "X") ->AsInput(); auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) ->assert_is_op_output("elementwise_add") @@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) .LinksTo({elementwise_add_out}); - elementwise_add_op_1->LinksFrom( - {elementwise_add_out, elementwise_add_in_y_1}); + elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1}) + .LinksTo({elementwise_add_out_1}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); return act_out; } diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f9663bbb734d5a6eb78135aecbc3a3b..c3a044d22cf04dceecc164fae934ee15c4563af1 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ 
class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2311614c335a56501ac777d787f6653659294765..ca31303f77c4a30eb64c43404e214779ea78aeaf 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") - .SetDefault({}); - Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..4c59c73d8779eceb267ad532aabccabbd54b0df2 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } - static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c524b4725dfebd6ef07594b0b45c65463..2c1648c81fc999c6306d5b08bc243f3ad21fec04 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. */ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2b2764db01257208002e0efd4a71cc10850c4260..f659a7b93fe06341b3c94aa07c526c982ff54483 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,15 +16,10 @@ limitations under the License. 
*/ #include #include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" + #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -162,59 +157,27 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - try { - if (VLOG_IS_ON(4)) { - VLOG(4) << place << " " << DebugStringEx(&scope); - } - if (platform::is_gpu_place(place)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } - - // The profile has a process-wide mutex, results in serious performance - // issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. - if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); - } - - if (VLOG_IS_ON(3)) { - VLOG(3) << place << " " << DebugStringEx(&scope); - } - } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw exception; - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + } - if (callstack.empty()) { - throw exception; - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); - throw exception; - } catch (...) { - std::rethrow_exception(std::current_exception()); + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. 
+ if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); } + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -417,7 +380,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var.Type().name()); + ToTypeName(var.Type())); } } @@ -428,7 +391,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -522,7 +485,7 @@ const std::vector ExecutionContext::MultiInput( PADDLE_ENFORCE( var->IsType(), "should be LoDTensor, but the received type is %s", - var->Type().name()); + ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -541,7 +504,7 @@ const std::vector ExecutionContext::LegacyMultiInput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -570,7 +533,7 @@ std::vector ExecutionContext::MultiOutput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return var->GetMutable(); }); return res; @@ -812,7 +775,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " "type_id is %s.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -835,7 +798,7 @@ class RuntimeInferShapeContext : public InferShapeContext { var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1fe2daacf1369902cde732422b4e65c3d156250f..bad9716e8b40916dabb5914277c3663532a3e964 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. 
constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -110,8 +116,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a921f469f5e0276884fe194c99b15100a11113dc..e14b74a87302a92de7724f3822859026a44b13d0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices( if (paddle::platform::is_gpu_place(main_tensor.place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; + buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices( #endif } else { platform::CPUPlace cpu; - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (i == 0) continue; - + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 4f79d9826099e8cb19b6a21a7723c75baa1745f9..750b626603178d2d2360c74b7b6530fa7cfe47b0 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -47,15 +47,9 @@ DEFINE_bool(fast_eager_deletion_mode, false, // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. 
#ifdef PADDLE_ON_INFERENCE -#define SCOPE_KIDS_READER_LOCK -#define SCOPE_KIDS_WRITER_LOCK -#define SCOPE_VARS_READER_LOCK -#define SCOPE_VARS_WRITER_LOCK +#define SCOPE_LOCK_GUARD #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); #endif namespace paddle { @@ -73,69 +67,64 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - Scope* child = new Scope(this); - { - SCOPE_KIDS_WRITER_LOCK - kids_.push_back(child); - } - return *child; + SCOPE_LOCK_GUARD + kids_.push_back(new Scope(this)); + return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD return VarInternal(name); } Variable* Scope::Var(std::string* name) { + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } - SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_KIDS_WRITER_LOCK + SCOPE_LOCK_GUARD for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_KIDS_READER_LOCK + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { + SCOPE_LOCK_GUARD std::vector known_vars; - { - SCOPE_VARS_READER_LOCK - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); - } + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_KIDS_WRITER_LOCK + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -149,8 +138,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { + SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); - SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -162,12 +151,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -176,11 +165,9 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::VarInternal(const std::string& name) { auto* v = FindVarLocally(name); if (v 
!= nullptr) return v; - v = new Variable(); - vars_[name].reset(v); + vars_.emplace(name, std::unique_ptr(v)); VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 797d110159391482929f2cea2d9455132619c78b..794b8e4c9417fb62bfccdf5c610942f95250d1d4 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,19 +14,12 @@ limitations under the License. */ #pragma once -extern "C" { -#include -} - -#include #include -#include +#include // NOLINT #include #include -#include #include -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -138,8 +131,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable RWLock kids_lock_; - mutable RWLock vars_lock_; + mutable std::mutex mutex_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 3b6f1cdb8f24ab20bfc80eeeba88891d7b41d1f9..73be446f71f193bea203c986b482e6b98a9826c5 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -19,52 +19,50 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { template -inline bool IsType(const std::type_index& type_index) { - return type_index == std::type_index(typeid(T)); +inline bool IsType(const std::type_index& type) { + return type == typeid(T); } -inline proto::VarType::Type ToVarType(std::type_index type) { - if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_RANK_TABLE; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (IsType(type)) { - return proto::VarType_Type_SELECTED_ROWS; - } else if (IsType(type)) { - return proto::VarType_Type_READER; - } else { - PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); } } template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { - switch (ToVarType(var.Type())) { - case proto::VarType_Type_LOD_TENSOR: + switch (var.Type()) { + case proto::VarType::LOD_TENSOR: visitor(var.Get()); return; - case proto::VarType_Type_LOD_RANK_TABLE: + case proto::VarType::LOD_RANK_TABLE: visitor(var.Get()); return; - case proto::VarType_Type_LOD_TENSOR_ARRAY: + case proto::VarType::LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case proto::VarType_Type_SELECTED_ROWS: + case proto::VarType::SELECTED_ROWS: visitor(var.Get()); return; - case proto::VarType_Type_READER: + case proto::VarType::READER: visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/var_type_inference_test.cc 
b/paddle/fluid/framework/var_type_inference_test.cc index 7842168f603885ce7dc87d2a01dfa4f544389faa..2a75394fca719196a9d53894b080598e942baa45 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, + ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3c5bab23b92a0274cf786ea2f18d8246706162f --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +// Besides registering variable type id, it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { + using Type = + typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); + constexpr int kId = VarTypeTrait::kId; + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); + VarIdToTypeIndexMapInitializerImpl::Init(id_to_type, + type_to_id); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *, MapType2 *) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map and std::type_index -> var_id map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + + public: + static const std::type_index &ToTypeIndex(int var_id) { + auto it = 
Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; + } + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; +}; + +} // namespace detail + +const std::type_index &ToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +int ToTypeId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..cc68cf2ab8e1bbc8a57cf97a2084610440a75f85 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,195 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include +#ifndef _WIN32 +#include +#endif +#endif + +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { +template +class AlgorithmsCache; + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + +namespace paddle { +namespace framework { + +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); +int ToTypeId(const std::type_index &type); + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... 
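// (The position is zero-based and follows the declaration order of Args...;
//  it is what VarTypeRegistryImpl::TypePos() below reports for registered types.)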
+// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args..., return -1 + template + static constexpr int TypePos() { + return TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. +using VarTypeRegistry = detail::VarTypeRegistryImpl< + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, +#endif + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::CudnnRNNCache, +#endif + int, float>; + +template +struct VarTypeTrait { + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); + using Type = T; + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. + * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). 
+ */ + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto below +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); + +/** End of variable type registration */ + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..00840d634d802cfe17fbff127a75606cb5e2cf79 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
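The new test file below exercises the helpers declared in var_type_traits.h. For orientation, a minimal usage sketch (illustrative only: the function name IdNameRoundTrip and the pf alias are mine, everything else is a name this patch introduces):

#include <typeindex>
#include "paddle/fluid/framework/var_type_traits.h"

namespace pf = paddle::framework;

void IdNameRoundTrip() {
  // Proto-backed types reuse their protobuf enum value as the id
  // (fixed by REG_PROTO_VAR_TYPE_TRAIT in var_type_traits.h).
  constexpr int id = pf::VarTypeTrait<pf::LoDTensor>::kId;
  static_assert(id == pf::proto::VarType::LOD_TENSOR, "proto id is reused");

  // The runtime helpers defined in var_type_traits.cc map the id back to
  // type information, and the mapping round-trips.
  const char *name = pf::ToTypeName(id);            // typeid(LoDTensor).name()
  const std::type_index &ti = pf::ToTypeIndex(id);  // type_index of LoDTensor
  int round_trip = pf::ToTypeId(ti);                // == id
  (void)name;
  (void)round_trip;
}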
+ +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { + using Type = + typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_EQ(ToTypeId(actual_type), kId); + EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); + EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); + TypeIndexChecker::Check(var_id_set, + type_index_set); + } +}; + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *, SetType2 *) {} +}; + +TEST(var_type_traits, check_no_duplicate_registry) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); +} + 
+TEST(var_type_traits, test_registry) { + using Registry = detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a584df3ba90cf5c1a62a3879bf98ce5c..b9d07da822cf1eb42859e1d7d84437582fada8ff 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { @@ -27,10 +27,14 @@ class Variable { public: template const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); return *static_cast(holder_->Ptr()); } @@ -39,61 +43,61 @@ class Variable { template T* GetMutable() { if (!holder_) { - holder_.reset(new PlaceholderImpl(new T())); + holder_.reset(new PlaceholderImpl()); } else { - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); } return static_cast(holder_->Ptr()); } template bool IsType() const { - return holder_ != nullptr && - std::type_index(typeid(T)) == std::type_index(holder_->Type()); + return holder_ && holder_->Type() == VarTypeTrait::kId; } void Clear() { holder_.reset(); } - std::type_index Type() const { + int Type() const { PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); return holder_->Type(); } private: struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_info& Type() const = 0; - virtual void* Ptr() const = 0; + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; }; // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. template struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} - - virtual const std::type_info& Type() const { return type_; } - virtual void* Ptr() const { return static_cast(ptr_.get()); } + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } - std::unique_ptr ptr_; - const std::type_info& type_; + private: + T obj_; }; - std::unique_ptr - holder_; // pointers to a PlaceholderImpl object indeed. - - // name_ is only meaningful with a Scope and accessible by it. - // - // NOTE: Please don't expose name_ by adding methods like - // Variable::Name or Scope::VarName! A variable could have a human - // readable name or an auto-generated scope-unique name. 
In the - // former case, the caller knows the name and doesn't need to access - // the name; in the latter case, the variable should be identified - // by its address but not the unreadable name. - friend class Scope; - const std::string* name_; + // pointers to a PlaceholderImpl object indeed. + std::unique_ptr holder_; }; } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3dfe5ecfd563a686bb72b061aff602f73..511c9c52146ece4b90905cc9d49565103589c1ec 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -16,27 +16,28 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -TEST(Variable, GetMutable) { - using paddle::framework::Variable; - - struct Tensor { - int content_; - }; +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { std::unique_ptr v(new Variable()); - Tensor* t = v->GetMutable(); - t->content_ = 1234; + auto* t = v->GetMutable(); + *t = "1234"; - const Tensor& tt = v->Get(); - EXPECT_EQ(1234, tt.content_); + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } EXPECT_TRUE(false); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cb88333d1570322fbac7112755bab5e11c97201a..f84e1ab6b827b3b96d0a503394d95b06ed25a3d2 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -69,17 +69,17 @@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); i++) { - LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], result[i]); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 569a487328e2f1febe2ca5014b232dbd51d28079..03c2aa3fb8094ce2996f513b90589de0ef903ae8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // TODO(Superjomn) should avoid the case when a TensorArray is a // parameter. 
if (var_name == "feed" || var_name == "fetch") continue; - if (var->Type() == typeid(framework::LoDTensorArray)) { + if (var->IsType()) { VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6a5ea64de66fcac44117d0d8f7798e8875703ec6..213c6891d0e2320689c8c69266d40611f295edc8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -27,8 +27,11 @@ namespace details { // training phase. struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - valid_types_.insert(typeid(framework::Tensor)); - valid_types_.insert(typeid(framework::LoDTensor)); + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just @@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner { bool no_tensor_flag_{true}; std::vector arrays_; - std::unordered_set valid_types_; + std::unordered_set valid_types_; std::unordered_set no_tensor_vars_; }; diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 9a393a61c4b4586009a022884e88f3f5c6392ed3..7830e859567747e6c05686335919e8346f76320d 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + template static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, const std::vector> &data) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142801382b4fdeaa63f51390b63cf6db6cb8f60d..2213971c1764b1a0bddfce5830bbdf2ffedd61ee 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,10 +98,8 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); PaddleTensor input_tensor; input_tensor.name = "word"; - input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); - input_tensor.lod.assign({one_batch.lod}); input_tensor.dtype = PaddleDType::INT64; - TensorAssignData(&input_tensor, {one_batch.data}); + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 8aaab6d6649e1d4b6db7695df0e9dd219c89422c..9d3c7519430522878ace697ea5ed38aebb6b0855 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> query_data_all, title_data_all; + std::vector> query, title; std::vector lod1, lod2; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t 
num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,22 +31,9 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= query_data_all.size()) { - data.query_data_all.assign(query_data_all.begin() + batch_iter, - query_data_all.begin() + batch_end); - data.title_data_all.assign(title_data_all.begin() + batch_iter, - title_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - CHECK(!data.query_data_all.empty()); - CHECK(!data.title_data_all.empty()); - CHECK_EQ(data.query_data_all.size(), data.title_data_all.size()); - for (size_t j = 0; j < data.query_data_all.size(); j++) { - // calculate lod - data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); - data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size()); - } + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -67,8 +52,8 @@ struct DataRecord { // load title data std::vector title_data; split_to_int64(data[1], ' ', &title_data); - query_data_all.push_back(std::move(query_data)); - title_data_all.push_back(std::move(title_data)); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); } num_samples = num_lines; } @@ -80,15 +65,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_query_tensor.name = "left"; lod_title_tensor.name = "right"; auto one_batch = data->NextBatch(); - int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size - int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size - lod_query_tensor.shape.assign({size1, 1}); - lod_query_tensor.lod.assign({one_batch.lod1}); - lod_title_tensor.shape.assign({size2, 1}); - lod_title_tensor.lod.assign({one_batch.lod2}); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all); + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); // Set inputs. input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f19a2ed59ef2f666393124323ffee2f1e79ccf06..f8635968cebc490cf156162efaef4bbbab9dcc3e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> word_data_all, mention_data_all; + std::vector> word, mention; std::vector lod; // two inputs have the same lod info. 
- size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,20 +31,10 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= word_data_all.size()) { - data.word_data_all.assign(word_data_all.begin() + batch_iter, - word_data_all.begin() + batch_end); - data.mention_data_all.assign(mention_data_all.begin() + batch_iter, - mention_data_all.begin() + batch_end); - // Prepare LoDs - data.lod.push_back(0); - CHECK(!data.word_data_all.empty()); - CHECK(!data.mention_data_all.empty()); - CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); - for (size_t j = 0; j < data.word_data_all.size(); j++) { - // calculate lod - data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); - } + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); } batch_iter += batch_size; return data; @@ -65,8 +53,8 @@ struct DataRecord { // load mention data std::vector mention_data; split_to_int64(data[3], ' ', &mention_data); - word_data_all.push_back(std::move(word_data)); - mention_data_all.push_back(std::move(mention_data)); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); } num_samples = num_lines; } @@ -78,14 +66,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); - int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size - lod_word_tensor.shape.assign({size, 1}); - lod_word_tensor.lod.assign({one_batch.lod}); - lod_mention_tensor.shape.assign({size, 1}); - lod_mention_tensor.lod.assign({one_batch.lod}); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all); + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, + one_batch.lod); // Set inputs. 
input_slots->assign({lod_word_tensor, lod_mention_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f5082cd60f1ae4e4eaf9dbe59a446ace900ee456..e6d6cd2960b394e8cd20b473bed90ce511f806be 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -18,12 +18,9 @@ namespace paddle { namespace inference { struct DataRecord { - std::vector> title1_all, title2_all, title3_all, l1_all; std::vector> title1, title2, title3, l1; - std::vector title1_lod, title2_lod, title3_lod, l1_lod; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,41 +30,11 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= title1_all.size()) { - data.title1_all.assign(title1_all.begin() + batch_iter, - title1_all.begin() + batch_end); - data.title2_all.assign(title2_all.begin() + batch_iter, - title2_all.begin() + batch_end); - data.title3_all.assign(title3_all.begin() + batch_iter, - title3_all.begin() + batch_end); - data.l1_all.assign(l1_all.begin() + batch_iter, - l1_all.begin() + batch_end); - // Prepare LoDs - data.title1_lod.push_back(0); - data.title2_lod.push_back(0); - data.title3_lod.push_back(0); - data.l1_lod.push_back(0); - CHECK(!data.title1_all.empty()); - CHECK(!data.title2_all.empty()); - CHECK(!data.title3_all.empty()); - CHECK(!data.l1_all.empty()); - CHECK_EQ(data.title1_all.size(), data.title2_all.size()); - CHECK_EQ(data.title1_all.size(), data.title3_all.size()); - CHECK_EQ(data.title1_all.size(), data.l1_all.size()); - for (size_t j = 0; j < data.title1_all.size(); j++) { - data.title1.push_back(data.title1_all[j]); - data.title2.push_back(data.title2_all[j]); - data.title3.push_back(data.title3_all[j]); - data.l1.push_back(data.l1_all[j]); - // calculate lod - data.title1_lod.push_back(data.title1_lod.back() + - data.title1_all[j].size()); - data.title2_lod.push_back(data.title2_lod.back() + - data.title2_all[j].size()); - data.title3_lod.push_back(data.title3_lod.back() + - data.title3_all[j].size()); - data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); - } + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -92,10 +59,10 @@ struct DataRecord { // load l1 data std::vector l1_data; split_to_int64(data[3], ' ', &l1_data); - title1_all.push_back(std::move(title1_data)); - title2_all.push_back(std::move(title2_data)); - title3_all.push_back(std::move(title3_data)); - l1_all.push_back(std::move(l1_data)); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); } num_samples = num_lines; } @@ -109,24 +76,11 @@ void PrepareInputs(std::vector 
*input_slots, DataRecord *data, title3_tensor.name = "title3"; l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); - int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; - title1_tensor.shape.assign({title1_size, 1}); - title1_tensor.lod.assign({one_batch.title1_lod}); - int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; - title2_tensor.shape.assign({title2_size, 1}); - title2_tensor.lod.assign({one_batch.title2_lod}); - int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; - title3_tensor.shape.assign({title3_size, 1}); - title3_tensor.lod.assign({one_batch.title3_lod}); - int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; - l1_tensor.shape.assign({l1_size, 1}); - l1_tensor.lod.assign({one_batch.l1_lod}); - // assign data - TensorAssignData(&title1_tensor, one_batch.title1); - TensorAssignData(&title2_tensor, one_batch.title2); - TensorAssignData(&title3_tensor, one_batch.title3); - TensorAssignData(&l1_tensor, one_batch.l1); + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, one_batch.title3, one_batch.lod3); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index ef7e2198c5db4e723406fe1a31d2c3dde5121931..7eb44d9f4ea6e27a504984eac4f960bddc9032e1 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -176,6 +176,18 @@ void SetFakeImageInput(std::vector> *inputs, (*inputs).emplace_back(input_slots); } +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index ab3a30ce6bba14a7d5ec700a159d90031e6b5dc7..29f0f034a2aab50330d4d0127b870a5cb00d56a5 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") function (inference_download install_dir url filename) message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "finish downloading ${filename}") endfunction() function (inference_download_and_uncompress install_dir url filename) inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} + WORKING_DIRECTORY ${install_dir} + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") diff --git 
a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a14eb941cd98e333a3e85aff064e6099b3be396..ee154207754f20791bdba55d0d66aea600e8dee6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -46,7 +46,7 @@ endif() register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 855c4d70677395992e2bf685c910cbea2d37b20b..49e734ce96b0d38b59102575250a020e6924362a 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", - in_var->Type().name()); + framework::ToTypeName(in_var->Type())); } PADDLE_ENFORCE_NOT_NULL(input); diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 48800947fd387bf4d84a85e82fdcd7efa3f08de5..0360cf5273591946570cac47e2578e43f498b550 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (framework::IsType(og_outside.Type())) { + if (og_outside.IsType()) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (framework::IsType( - og_outside.Type())) { + } else if (og_outside.IsType()) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase { var->IsType(), "Currently the type of var only can be LoDTensorArray, " "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, var->Type().name()); + inside_grad_name, framework::ToTypeName(var->Type())); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b999e1ca3f992034781edaab9921a300..acceadab16493177754d3a8e27ee800a90c0adc8 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -22,7 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -204,7 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 
f2ba75485c58789de848b8833a1a527d45ced83c..fae0925149146dc06c50e0a9ff61b1686a9b7b5c 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -22,239 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -struct CudnnRNNCache { - CudnnRNNCache() { - x_desc_ = NULL; - y_desc_ = NULL; - dx_desc_ = NULL; - dy_desc_ = NULL; - } - ~CudnnRNNCache() { release(); } - - cudnnRNNDescriptor_t rnn_desc_; - cudnnTensorDescriptor_t *x_desc_; - cudnnTensorDescriptor_t *y_desc_; - cudnnTensorDescriptor_t *dx_desc_; - cudnnTensorDescriptor_t *dy_desc_; - - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; - - cudnnTensorDescriptor_t dhx_desc_; - cudnnTensorDescriptor_t dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_; - cudnnTensorDescriptor_t dcy_desc_; - - cudnnTensorDescriptor_t output_x_desc_; - cudnnTensorDescriptor_t output_y_desc_; - - cudnnDropoutDescriptor_t dropout_desc_; - - size_t weights_size_; - cudnnFilterDescriptor_t w_desc_; - cudnnFilterDescriptor_t dw_desc_; - - size_t workspace_size_; - size_t reserve_size_; - Tensor reserve_data_; - Tensor workspace_data_; - - Tensor dropout_state_; - - size_t max_length_; - - float dropout_prob_; - bool is_bidirec_; - - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - int seed_; - - void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, - size_t max_len, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, bool is_bidirec, int seed, - int weight_numel) { - max_length_ = max_len; - batch_size_ = batch_size; - input_size_ = input_size; - hidden_size_ = hidden_size; - num_layers_ = num_layers; - dropout_prob_ = dropout_prob; - is_bidirec_ = is_bidirec; - seed_ = seed; - - x_desc_ = new cudnnTensorDescriptor_t[max_length_]; - y_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; - int dim_a[3]; - int stride_a[3]; - - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); - dim_a[0] = batch_size_; - dim_a[1] = input_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - dim_a[0] = batch_size_; - dim_a[1] = is_bidirec_ ? 
hidden_size_ * 2 : hidden_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - } - - dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); - dim_a[1] = batch_size_; - dim_a[2] = hidden_size_; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - CUDNN_ENFORCE( - platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); - - size_t state_size; - CUDNN_ENFORCE( - platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); - dropout_state_.Resize({static_cast(state_size)})); - auto *dropout_state_data = - dropout_state_.mutable_data(ctx.GetPlace()); - CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( - dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, - seed_)); - - CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - -#if CUDNN_VERSION >= 6000 - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); -#else - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_DATA_FLOAT)); -#endif - - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); - - PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, - "cudnn lstm weight size should be SAME"); - int dim_w[3]; - dim_w[0] = weights_size_ / sizeof(float); - dim_w[1] = 1; - dim_w[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); - - reserve_data_.Resize({static_cast(reserve_size_)}); - reserve_data_.mutable_data(ctx.GetPlace()); - - workspace_data_.Resize({static_cast(workspace_size_)}); - workspace_data_.mutable_data(ctx.GetPlace()); - } - - void release() { - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); - } - - delete[] x_desc_; - delete[] y_desc_; - delete[] dx_desc_; - delete[] dy_desc_; - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); - } -}; - template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { auto input_w_numel = w->numel(); auto batch_size = x->dims()[1]; - cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, - hidden_size, num_layers, dropout_prob, is_bidirec, - seed, input_w_numel); + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); } auto run_seq_len = x->dims()[0]; diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..7f18b839271a29523cc06c999c28cc0394717397 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? 
hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90784ed089cd620a4f0a908f196bcdda..7c0fda4169b5e1cf663d04b78b6425d73965c292 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index a800d5df0a7cbc668a0217350098bce2bfdcfa70..8660bc219c12fe8676e3be646c4b878a39700201 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... args) { - PADDLE_ENFORCE(ptr != nullptr, args...); + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index d2b0eb6ca6de1984dc7cfc2a662c88d5e56e1e05..27ca1f4edc04f5fca54b1a6340243634a596939c 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -84,7 +84,9 @@ class ProtoEncodeHelper { ~ProtoEncodeHelper() { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - paddle::platform::throw_on_error(p_ <= limit_); + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_); + } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3c0b7ff24f9cf0b128d32aa185e5e3cb47d9c4b9..a8bb597cbd59290df1347c164d37104c6ac431e9 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) if(WITH_GPU AND NOT WIN32) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) - op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common) + op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index acc9b1e6227942781db61a3bc50b2ac95865f79c..6676ecd1c85d70cd5961af2fb1537e77b10e41bc 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel { } else { PADDLE_THROW( "% should be LoDTensor or SelectedRows, but the received type is %s", - ctx.Inputs("Ids")[0], ids_var->Type().name()); + ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type())); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index a8b8a67a114b956f2d6b1b072ef343a179114b34..7a7a3989c047ae379cc14e2f783662db99239445 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z = ctx.Output("Out"); } else { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - x_var->Type().name()); + framework::ToTypeName(x_var->Type())); } z->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 0a18882e8199c2a375a230a693b8b01d12aabfa0..4e4f977fcc742856b877ef0b7f9a3cc9879aefce 100644 --- 
a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,8 +50,8 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); @@ -132,8 +132,8 @@ template class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); PADDLE_ENFORCE( diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index 6c616aa03d9809e9b7725a700c7edd5ff5d6dc42..3f51bb0b3d6ddf41a08a64f254f76c88b60ced22 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 9f6ef391696aa8718be71ae945e746b876813d94..13455fc42cdc72a8ebfcac3dc0c94b79497d91f6 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto *param_out_tensor = ctx.Output("ParamOut"); auto *moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1138bb7400e0e7a00983e7bfaad2b2d9704b77ab..5c559484ec95e794ebbbe0e713cb9e26b5c01b98 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -347,7 +347,8 @@ class AdamOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h index 
7137fbd9651b4523f6d1609a0595b30758aa40df..55d25ecbddf175c0c9ba2c68ef2f6c7b83dcf32e 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ b/paddle/fluid/operators/optimizers/adamax_op.h @@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 5df43d33ef9f720fd20d57c53ff37cc85440b24e..4abd436927707f1a18039c9104a92b2a0bf3c982 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 8f812c9a037bfac8c1e29e32a5ad5b077c8153d1..bbf34d8316b09a78c334b0d79b132639be8af4f7 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index f6ef83c3bad23d709b386f8e75bbc97fa9ba0aab..3ed1bff5ff4993e9c858dea8d56a8cb6124aca89 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -395,7 +395,7 @@ class MomentumOpKernel : public framework::OpKernel { PADDLE_THROW( string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " "gradient, but the received Variable Type is %s", - grad_var->Type().name())); + framework::ToTypeName(grad_var->Type()))); } } }; diff --git 
a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a9d303d55d8f681fe3a014db36ede5ef6b2742bd..975e4b8e7212bc61d5df0ca350fcf12afb463cba 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce..c39f94637a1abb5bfce9a5428419282f2b870c91 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 4f717a43551d6d79292bd1d49664d35588a8ec3a..01996e6bf975227270914aa6bec26aacfc814c94 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -126,7 +126,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", - x_vars[0]->Type().name()); + framework::ToTypeName(x_vars[0]->Type())); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 76cc796a9b8e21849b1d86e512cd70752fd027ac..a8b2df186dbfcb2a913e9532e2a475f1ad0d23a1 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07f848356e801be95d53a61bb5b08e37..06680539507886becb751d02cb02abbf702f1948 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... 
args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,21 +258,49 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ + ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); + +#define __PADDLE_THROW_ON_ERROR(COND, ...) \ + __PADDLE_THROW_ERROR_I( \ + __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define __PADDLE_ENFORCE_I(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ } catch (...) 
{ \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (false) + } while (0) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG +#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args +#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) + #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d52182965552e9ec945cb7d0b421d8addcb758e9..1091badae54a809c4a9da6d0398bcbb538420af0 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -37,6 +37,25 @@ TEST(ENFORCE, FAILED) { HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok at all"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok at all")); + } + EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_NE(std::string(error.what()).find(" at "), 0); + } + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 27e930e6e0a76982b3f27619f38a4a08d82cafa1..3a937dfaec3acc7c116f0077694e9aee1b379061 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index e2b7ca9b03809113c31af8ff4d3ad3713748f330..b1b51d804e02f233bcd16149005092dc80e9c79d 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f8ded9f94ecaf3df1e14aead60ae12abcf8c34a9..06d8b65fb1480d9f621ca937c1d66ab7e910f010 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,9 +49,6 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); - op_proto_and_checker_maker.def( - "kOpCreationCallstackAttrName", - framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index a2eec6e3c48dd126614bbff0227145537b678ac4..0b94b60018aac3a61edfda4d7ecb762e9fe70673 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -87,7 +87,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { template std::string Sprintf(const Args&... 
args) { std::ostringstream oss; - Fprintf(oss, ""); + Fprintf(oss, "%s", args...); return oss.str(); } diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3427fb0c4ae4dad0323b39cf85bfb5b04f796996..de30ed2fc5858187d2ecede299832701304e4198 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -20,7 +20,6 @@ import os import re import six import sys -import traceback import numpy as np @@ -605,10 +604,6 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] - callstack_var_name = op_maker.kOpCreationCallstackAttrName() - op_attrs[callstack_var_name] = list( - reversed(traceback.format_stack()))[1:] - if len(self.desc.type()) != 0: return if type is None: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 74cf76da951a4cea884c4fdb8591b3d4fb010300..c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,7 @@ class ParallelExecutor(object): trainers_endpoints), "num_trainers == len(end_points)" build_strategy.trainers_endpoints = trainers_endpoints - # step5: get persistable_vars, parameter_vars, places. persistable_vars + # step6: get persistable_vars, places. persistable_vars # need be broadcast to other local_scope. persistable_vars = set([ cpt.to_text(v.name) for v in [ @@ -164,7 +164,7 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) - # step6: init ParallelExecutor + # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, cpt.to_text(loss_name) diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 37b9a9188ab44df81029ae6d9925ae21c1929cff..4153394c1da776d0a41e1415a09fa7d6f4b14d6d 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope", "op_callstack" + "op_namescope" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py new file mode 100644 index 0000000000000000000000000000000000000000..f37d2bfb2e86b452cf7fd05c3e5871de2e33d629 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import contextlib + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.fluid.core as core + +import paddle.fluid as fluid + + +def get_places(): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + +@contextlib.contextmanager +def prog_scope_guard(main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestWeightDecay(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=4)() + self.train_data = [next(reader) for _ in range(5)] + self.learning_rate = .5 + + def run_executor(self, place, feed_list, loss): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + main_prog = fluid.default_main_program() + loss_set = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=[loss.name]) + + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def run_parallel_exe(self, + place, + feed_list, + loss, + use_cuda=True, + use_reduce=False, + use_fast_executor=False, + use_ir_memory_optimize=False): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + if use_fast_executor: + exec_strategy.use_experimental_executor = True + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.memory_optimize = use_ir_memory_optimize + + parallel_exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + loss_set = [] + for data in self.train_data: + out = parallel_exe.run(feed=feeder.feed(data), + fetch_list=[loss.name]) + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def check_weight_decay(self, + place, + model, + use_parallel_exe=False, + use_reduce=False): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = 
model(data, label, len(self.word_dict)) + + param_list = [(var, var * self.learning_rate) + for var in main_prog.block(0).all_parameters()] + + optimizer = fluid.optimizer.Adagrad( + learning_rate=self.learning_rate) + + optimizer.minimize(avg_cost) + + for params in param_list: + updated_p = fluid.layers.elementwise_sub( + x=params[0], y=params[1]) + fluid.layers.assign(input=updated_p, output=params[0]) + + if use_parallel_exe: + loss = self.run_parallel_exe( + place, [data, label], + loss=avg_cost, + use_cuda=True, + use_reduce=use_reduce) + else: + loss = self.run_executor(place, [data, label], loss=avg_cost) + + return loss + + def test_weight_decay(self): + model = partial(bow_net, is_sparse=False) + for place in get_places(): + loss = self.check_weight_decay(place, model, use_parallel_exe=False) + + loss2 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=False) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5) + + loss3 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=True) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5) + + +if __name__ == '__main__': + unittest.main()
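
Note on the enforce.h rework above: splitting is_error() out of throw_on_error() means the condition is evaluated exactly once, the success path costs only a branch, and message formatting happens only after the check has already failed; it also lets call sites such as ProtoEncodeHelper's destructor test is_error() before deciding to throw. Below is a minimal standalone C++ sketch of that dispatch pattern, not Paddle's actual header; names such as MyStatus and MY_ENFORCE are illustrative only.

// Standalone sketch (assumed names, C++17) of the is_error / throw_on_error
// split: overloads of is_error() decide what "failure" means per status type,
// and the macro formats a message only on the failure path.
#include <sstream>
#include <stdexcept>

enum class MyStatus { kSuccess, kAllocFailed };

// Per-type definitions of "this value is an error".
inline bool is_error(bool stat) { return !stat; }
inline bool is_error(MyStatus stat) { return stat != MyStatus::kSuccess; }

// Formatting runs only after is_error() has already reported failure.
template <typename... Args>
[[noreturn]] void throw_on_error(bool, const Args&... args) {
  std::ostringstream oss;
  (oss << ... << args);  // fold expression stands in for string::Sprintf
  throw std::runtime_error(oss.str());
}

template <typename... Args>
[[noreturn]] void throw_on_error(MyStatus, const Args&... args) {
  std::ostringstream oss;
  oss << "status error: ";
  (oss << ... << args);
  throw std::runtime_error(oss.str());
}

// Evaluate the condition once, test it, and throw only on failure,
// mirroring the do/while shape of __PADDLE_UNARY_COMPARE in the hunk above.
#define MY_ENFORCE(COND, ...)              \
  do {                                     \
    auto __cond = (COND);                  \
    if (is_error(__cond)) {                \
      throw_on_error(__cond, __VA_ARGS__); \
    }                                      \
  } while (0)

int main() {
  MY_ENFORCE(1 + 1 == 2, "arithmetic broke");  // passes silently
  try {
    MY_ENFORCE(MyStatus::kAllocFailed, "alloc failed");  // throws
  } catch (const std::runtime_error&) {
  }
  return 0;
}

The argument-counting trick in __PADDLE_THROW_ERROR_I serves a separate purpose (routing the single-argument case through ::paddle::string::Sprintf, matching the Sprintf fix in printf.h) and is deliberately left out of this sketch.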