From 3232618af80bbe3ff29bd504a5e5434409bb5221 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Fri, 5 Jul 2019 11:38:15 +0800
Subject: [PATCH] checkerrpick Make fuse_all_reduce_op_pass support
 mix_precision test=develop test=release (#18490)

---
 .../framework/details/multi_devices_helper.h |   6 +-
 .../alloc_continuous_space_for_grad_pass.cc  | 283 +++++++++++-------
 .../fuse_optimizer_op_pass.cc                |  11 +-
 .../fuse_all_reduce_op_pass.cc               |  69 +++--
 .../operators/alloc_continuous_space_op.cc   |   5 +
 paddle/fluid/operators/optimizers/sgd_op.cc  |  13 +-
 paddle/fluid/operators/optimizers/sgd_op.cu  |   6 +-
 .../test_mix_precision_all_reduce_fuse.py    |  91 ++++++
 8 files changed, 332 insertions(+), 152 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py

diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index e97e5f439d..1cc6760fc5 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -58,15 +58,15 @@ constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
 typedef std::string FusedOptType;
 constexpr char kFusedOptType[] = "fused_opt_type";
 
-typedef std::string FusedGrads;
+typedef std::vector<std::string> FusedGrads;
 constexpr char kFusedGrads[] = "fused_gradients";
 
 typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
 constexpr char kParamsAndGrads[] = "params_grads";
 
 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
-    GroupGradsAndParams;
-constexpr char kGroupGradsAndParams[] = "group_grads_params";
+    GroupParamsAndGrads;
+constexpr char kGroupParamsAndGrads[] = "group_params_grads";
 
 } // namespace details
 } // namespace framework
diff --git a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
index 6f519d92e4..3fc3acce47 100644
--- a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
 #include <algorithm>
+#include <map>
 #include <string>
 #include <utility>
 #include <vector>
@@ -52,18 +53,13 @@ static constexpr double kMB = 1048576.0;
 void SetFuseParameterGroupsSize(int group_size) {
   FLAGS_fuse_parameter_groups_size = group_size;
 }
-
 int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; }
 
 void SetFuseParameterMemorySize(double memory_size) {
   FLAGS_fuse_parameter_memory_size = memory_size;
 }
-
 double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; }
 
-static framework::proto::VarType::Type kDefaultDtype =
-    framework::proto::VarType::Type::VarType_Type_BOOL;
-
 class AllocContinuousSpaceForGradPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const {
@@ -73,19 +69,16 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
 
     ResetAttribute<details::ParamsAndGrads>(details::kParamsAndGrads, &result);
-    ResetAttribute<details::GroupGradsAndParams>(details::kGroupGradsAndParams,
+    ResetAttribute<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads,
                                                  &result);
 
-    // NOTE: The operator nodes should be in topology order.
-    std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
     auto &params_grads =
         result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
-    for (auto &node : topo_nodes) {
-      RecordParamsAndGrads(node, &params_grads);
-    }
+    RecordParamsAndGrads(result, &params_grads);
 
-    if (params_grads.size() == 0) {
-      LOG(INFO) << "Doesn't find gradients";
+    auto num_params_grads = params_grads.size();
+    VLOG(10) << "The number of params and grads is:" << num_params_grads;
+    if (num_params_grads == 0) {
       return;
     }
 
@@ -101,24 +94,43 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       }
     }
 
-    auto &group_grads_params =
-        result.Get<details::GroupGradsAndParams>(details::kGroupGradsAndParams);
-
-    // Note: the order of params_grads may be changed by SetGroupGradsAndParams.
-    SetGroupGradsAndParams(var_name2node, params_grads, &group_grads_params);
+    auto &group_params_grads =
+        result.Get<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads);
+    // Note: the order of params_grads may be changed by SetGroupParamsAndGrads.
+    SetGroupParamsAndGrads(var_name2node, params_grads, &group_params_grads);
 
     params_grads.clear();
-    for (auto &group_p_g : group_grads_params) {
-      params_grads.insert(params_grads.begin(), group_p_g.begin(),
+    params_grads.reserve(num_params_grads);
+    for (auto &group_p_g : group_params_grads) {
+      params_grads.insert(params_grads.end(), group_p_g.begin(),
                           group_p_g.end());
     }
+    PADDLE_ENFORCE_EQ(
+        num_params_grads, params_grads.size(),
+        "The number of params_grads is not consistent with before.");
 
-    for (auto &p_g : params_grads) {
-      std::swap(p_g.first, p_g.second);
+    if (IsUnifiedDtype(params_grads, var_name2node)) {
+      SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
+      AllocContinuousAddressSpace(places, local_scopes, var_name2node,
+                                  params_grads, &result);
+    } else {
+      // Set Gradients as Persistable to prevent this var becoming reusable.
+      for (auto &sub_param_grad : group_params_grads) {
+        SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
+        PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, var_name2node),
+                       "The data type of the same group is not consistent.");
+        AllocContinuousAddressSpace(places, local_scopes, var_name2node,
+                                    sub_param_grad, &result);
+      }
     }
+  }
 
-    // Set Gradients as Persistable to prevent this var becoming reusable.
- auto dtype = kDefaultDtype; - for (auto &p_g : params_grads) { + void SetGradientPersistable( + const std::vector> &sub_param_grad, + const std::unordered_map &var_name2node, + const std::unordered_map> + &var_name2node_set) const { + for (auto &p_g : sub_param_grad) { // Get gradient var auto iter = var_name2node.find(p_g.second); PADDLE_ENFORCE(iter != var_name2node.end(), "%s is not found.", @@ -132,32 +144,45 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType())); + } + } - // Get Dtype - auto ele_dtype = iter->second->Var()->GetDataType(); - if (dtype == kDefaultDtype) { - dtype = ele_dtype; - PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype, - "The data type should not be bool."); + bool IsUnifiedDtype( + const details::ParamsAndGrads ¶ms_grads, + const std::unordered_map &var_name2node) const { + auto dtype = + this->GetDtypeOfVar(var_name2node, params_grads.front().second); + for (auto p_g : params_grads) { + auto next_dtype = this->GetDtypeOfVar(var_name2node, p_g.second); + if (next_dtype != dtype) { + return false; } - PADDLE_ENFORCE_EQ(ele_dtype, dtype, - "The data type of input is not consistent."); } + return true; + } + void AllocContinuousAddressSpace( + const std::vector &places, + const std::vector &local_scopes, + const std::unordered_map &var_name2node, + const details::ParamsAndGrads ¶ms_grads, Graph *result) const { // Create a FusedVarsSet to avoid duplicating names for fused_var in other // pass. - if (!result.Has(details::kFusedVars)) { - result.Set(details::kFusedVars, new details::FusedVars); + if (!result->Has(details::kFusedVars)) { + result->Set(details::kFusedVars, new details::FusedVars); } // the kFusedGrads is used be fuse_optimizer_op_pass. - result.Set(details::kFusedGrads, new details::FusedGrads); + if (!result->Has(details::kFusedGrads)) { + result->Set(details::kFusedGrads, new details::FusedGrads); + } // the fused_var_name should be unique, so it appends // params_grads.begin()->second. 
auto fused_var_name = std::string(details::kFusedVarNamePrefix) + "@GRAD@" + params_grads.begin()->second; - result.Get(details::kFusedGrads) = fused_var_name; - auto &fused_var_set = result.Get(details::kFusedVars); + result->Get(details::kFusedGrads) + .emplace_back(fused_var_name); + auto &fused_var_set = result->Get(details::kFusedVars); PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, "%s is duplicate in FusedVars.", fused_var_name); fused_var_set.insert(fused_var_name); @@ -175,109 +200,126 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { graph->Set(attr_name, new AttrType); } - void SetGroupGradsAndParams( + void SetGroupParamsAndGrads( const std::unordered_map &var_nodes, const details::ParamsAndGrads ¶ms_grads, - details::GroupGradsAndParams *group_grads_params) const { - SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params); - SetGroupAccordingToMemorySize(var_nodes, group_grads_params); + details::GroupParamsAndGrads *group_params_grads) const { + SetGroupAccordingToLayers(var_nodes, params_grads, group_params_grads); + SetGroupAccordingToMemorySize(var_nodes, group_params_grads); } void SetGroupAccordingToLayers( const std::unordered_map &var_nodes, const details::ParamsAndGrads ¶ms_grads, - details::GroupGradsAndParams *group_grads_params) const { - std::unordered_map> layer_params; + details::GroupParamsAndGrads *group_params_grads) const { + using var_dtype = std::pair; + std::map var_idx; for (size_t i = 0; i < params_grads.size(); ++i) { auto pos = params_grads[i].first.find_first_of("."); + + auto dtype = GetDtypeOfVar(var_nodes, params_grads[i].second); + var_dtype var_key; if (pos == std::string::npos) { - layer_params[params_grads[i].first].emplace_back(i); + var_key = std::make_pair(params_grads[i].first, dtype); } else { - layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i); + var_key = std::make_pair(params_grads[i].first.substr(0, pos), dtype); } - } - group_grads_params->reserve(layer_params.size()); - for (size_t i = 0; i < params_grads.size(); ++i) { - auto pos = params_grads[i].first.find_first_of("."); - std::string key = params_grads[i].first; - if (pos != std::string::npos) { - key = params_grads[i].first.substr(0, pos); - } - auto iter = layer_params.find(key); - if (iter == layer_params.end()) continue; - - group_grads_params->emplace_back(); - auto &local_group_grads_params = group_grads_params->back(); - for (auto &idx : iter->second) { - local_group_grads_params.emplace_back( - std::make_pair(params_grads[idx].second, params_grads[idx].first)); + size_t idx = 0; + auto var_idx_iter = var_idx.find(var_key); + if (var_idx_iter != var_idx.end()) { + idx = var_idx_iter->second; + } else { + group_params_grads->emplace_back(); + idx = group_params_grads->size() - 1; + var_idx[var_key] = idx; } - layer_params.erase(iter); + auto &local_group_params_grads = group_params_grads->at(idx); + local_group_params_grads.emplace_back( + std::make_pair(params_grads[i].first, params_grads[i].second)); } - VLOG(10) << "SetGroupAccordingToLayers: "; if (VLOG_IS_ON(10)) { - PrintGroupInfo(var_nodes, group_grads_params); + VLOG(10) << "SetGroupAccordingToLayers: "; + PrintGroupInfo(var_nodes, group_params_grads); } } void PrintGroupInfo( const std::unordered_map &var_nodes, - details::GroupGradsAndParams *group_grads_params) const { - for (size_t i = 0; i < group_grads_params->size(); ++i) { + details::GroupParamsAndGrads *group_params_grads) const { + for (size_t i = 0; i < group_params_grads->size(); ++i) { VLOG(10) << 
"group " << i; std::stringstream out; size_t gps_size = 0; - for (auto &g_p : group_grads_params->at(i)) { - auto iter = var_nodes.find(g_p.second); - PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second); + for (auto &p_g : group_params_grads->at(i)) { + auto iter = var_nodes.find(p_g.first); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", p_g.first); auto shape = iter->second->Var()->GetShape(); size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); std::for_each(shape.begin(), shape.end(), [&size](const int64_t &n) { size *= n; }); gps_size += size; - out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first); + out << string::Sprintf("(%s(%d), %s)", p_g.first, size, p_g.second); } + + auto dtype = this->GetDtypeOfVar(var_nodes, + group_params_grads->at(i).front().first); + VLOG(10) << out.str() - << ", group size:" << group_grads_params->at(i).size() + << ", group size:" << group_params_grads->at(i).size() << ", group memory size:" << static_cast(gps_size) / kMB - << "(MB)"; + << "(MB)" + << ", dtype:" << dtype; } } void SetGroupAccordingToMemorySize( const std::unordered_map &var_nodes, - details::GroupGradsAndParams *group_grads_params) const { + details::GroupParamsAndGrads *group_params_grads) const { const double group_memory_size = GetFuseParameterMemorySize(); if (group_memory_size <= 0.0) { return; } - details::GroupGradsAndParams local_group_grads_params; + details::GroupParamsAndGrads local_group_params_grads; + size_t j = 0; - while (j < group_grads_params->size()) { - local_group_grads_params.emplace_back(); - auto &group_p_g = local_group_grads_params.back(); + while (j < group_params_grads->size()) { + local_group_params_grads.emplace_back(); + auto &group_p_g = local_group_params_grads.back(); + + auto &grad_name = group_params_grads->at(j).front().second; + auto var_type = GetDtypeOfVar(var_nodes, grad_name); + size_t local_group_memory_size = 0; - while (j < group_grads_params->size()) { + while (j < group_params_grads->size()) { std::for_each( - group_grads_params->at(j).begin(), group_grads_params->at(j).end(), + group_params_grads->at(j).begin(), group_params_grads->at(j).end(), [&local_group_memory_size, - &var_nodes](const std::pair &g_p) { - auto iter = var_nodes.find(g_p.second); + &var_nodes](const std::pair &p_g) { + auto iter = var_nodes.find(p_g.second); PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", - g_p.second); - auto shape = iter->second->Var()->GetShape(); + p_g.second); + size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); + auto shape = iter->second->Var()->GetShape(); std::for_each(shape.begin(), shape.end(), [&size](const int64_t &n) { size *= n; }); + local_group_memory_size += size; }); - group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), - group_grads_params->at(j).end()); + + group_p_g.insert(group_p_g.end(), group_params_grads->at(j).begin(), + group_params_grads->at(j).end()); + ++j; + + if (j >= group_params_grads->size()) { + break; + } + if (GetFuseParameterGroupsSize() > 1 && group_p_g.size() > static_cast(GetFuseParameterGroupsSize())) { @@ -288,49 +330,64 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { group_memory_size) { break; } + + auto next_var_type = + GetDtypeOfVar(var_nodes, group_params_grads->at(j).front().second); + if (next_var_type != var_type) { + break; + } } } - std::swap(*group_grads_params, local_group_grads_params); - - VLOG(10) << string::Sprintf( - "SetGroupAccordingToMemorySize(memory_size: 
%f):", group_memory_size); + std::swap(*group_params_grads, local_group_params_grads); if (VLOG_IS_ON(10)) { - PrintGroupInfo(var_nodes, group_grads_params); + VLOG(10) << string::Sprintf( + "SetGroupAccordingToMemorySize(memory_size: %f):", group_memory_size); + PrintGroupInfo(var_nodes, group_params_grads); } } + proto::VarType::Type GetDtypeOfVar( + const std::unordered_map &var_nodes, + const std::string &name) const { + auto grad_iter = var_nodes.find(name); + PADDLE_ENFORCE(grad_iter != var_nodes.end()); + PADDLE_ENFORCE_NOT_NULL(grad_iter->second->Var()); + return grad_iter->second->Var()->GetDataType(); + } + private: bool IsSupportedVarType(const proto::VarType::Type &type) const { // Current only support LOD_TENSOR. return type == proto::VarType::LOD_TENSOR; } - void RecordParamsAndGrads(ir::Node *node, + void RecordParamsAndGrads(const ir::Graph &graph, details::ParamsAndGrads *params_grads) const { - try { - bool is_bk_op = - static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward)); - if (!is_bk_op) return; - - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. - auto backward_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - VLOG(10) << "Trainable parameter: " << backward_vars[i] - << ", gradient: " << backward_vars[i + 1]; - - params_grads->emplace_back(std::make_pair( - backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/)); + std::vector topo_nodes = ir::TopologySortOperations(graph); + for (auto &node : topo_nodes) { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. 
+ auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); + for (size_t i = 0; i < backward_vars.size(); i += 2) { + VLOG(10) << "Trainable parameter: " << backward_vars[i] + << ", gradient: " << backward_vars[i + 1]; + + params_grads->emplace_back(std::make_pair( + backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/)); + } + } catch (boost::bad_get e) { } - } catch (boost::bad_get e) { } } diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index f636bcd0ea..4119d39598 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -101,10 +101,17 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { "this pass."); } auto &fused_grad = result.Get(details::kFusedGrads); + PADDLE_ENFORCE_NE(fused_grad.size(), 0, + "The fused gradient should not be empty."); + PADDLE_ENFORCE_EQ(fused_grad.size(), 1, + "Because the dtype of those gradients " + "is not unified, so the number of fused gradients is " + "more than one, but it is not supported currently."); auto &fused_vars = result.Get(details::kFusedVars); - auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); + auto iter = + std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front()); PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); - fused_vars_name[kGrad] = fused_grad; + fused_vars_name[kGrad] = fused_grad.front(); // Sort the parameters and auxiliary variables according // to parameters' name to make variables' name correspond correctly. diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index abfaf1b8d2..77173b5866 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -30,7 +30,6 @@ class FuseAllReduceOpPass : public ir::Pass { protected: void ApplyImpl(ir::Graph *graph) const override { ir::Graph &result = *graph; - auto &places = Get>(details::kPlaces); auto &local_scopes = Get>(details::kLocalScopes); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -38,38 +37,17 @@ class FuseAllReduceOpPass : public ir::Pass { &Get(details::kNCCLCtxs); #endif - std::unordered_set grads; auto ¶ms_grads = result.Get(details::kParamsAndGrads); size_t num_of_all_reduce = params_grads.size(); + std::unordered_set grads; grads.reserve(num_of_all_reduce); for (auto p_g : params_grads) { grads.insert(p_g.second); } - size_t num_place = places.size(); - std::unordered_map all_reduce_ops; - all_reduce_ops.reserve(grads.size()); - for (auto &node : result.Nodes()) { - if (node->IsOp()) { - PADDLE_ENFORCE(node->IsWrappedBy()); - auto *all_reduce_op_handle = dynamic_cast( - &node->Wrapper()); - if (all_reduce_op_handle) { - auto inputs = details::DynamicCast( - all_reduce_op_handle->Inputs()); - PADDLE_ENFORCE_EQ(inputs.size(), num_place); - // The inputs' name should be the same. 
- auto &grad_name = inputs[0]->name(); - for (size_t i = 1; i < inputs.size(); ++i) { - PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name, - "The input name should be the same."); - } - PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast(0)); - all_reduce_ops.emplace(grad_name, node); - } - } - } + std::unordered_map all_reduce_ops = + GetAllReduceOps(result, places, grads); VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size(); if (all_reduce_ops.size() == 0) { @@ -82,16 +60,16 @@ class FuseAllReduceOpPass : public ir::Pass { "it is not supported currently."); VLOG(10) << "Insert fused_all_reduce"; - auto &group_grads_params = - graph->Get(details::kGroupGradsAndParams); + auto &group_params_grads = + graph->Get(details::kGroupParamsAndGrads); - for (auto &group_g_p : group_grads_params) { - size_t group_size = group_g_p.size(); + for (auto &group_p_g : group_params_grads) { + size_t group_size = group_p_g.size(); PADDLE_ENFORCE_GT(group_size, static_cast(0)); std::vector group_all_reduce_ops; group_all_reduce_ops.reserve(group_size); - for (auto &g_p : group_g_p) { - group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first)); + for (auto &p_g : group_p_g) { + group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second)); } #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) InsertFusedAllReduce(places, local_scopes, group_size, @@ -103,6 +81,35 @@ class FuseAllReduceOpPass : public ir::Pass { } } + std::unordered_map GetAllReduceOps( + const Graph &result, const std::vector &places, + const std::unordered_set &grads) const { + size_t num_place = places.size(); + std::unordered_map all_reduce_ops; + all_reduce_ops.reserve(grads.size()); + for (auto &node : result.Nodes()) { + if (node->IsOp()) { + PADDLE_ENFORCE(node->IsWrappedBy()); + auto *all_reduce_op_handle = dynamic_cast( + &node->Wrapper()); + if (all_reduce_op_handle) { + auto inputs = details::DynamicCast( + all_reduce_op_handle->Inputs()); + PADDLE_ENFORCE_EQ(inputs.size(), num_place); + // The inputs' name should be the same. 
+ auto &grad_name = inputs[0]->name(); + for (size_t i = 1; i < inputs.size(); ++i) { + PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name, + "The input name should be the same."); + } + PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast(0)); + all_reduce_ops.emplace(grad_name, node); + } + } + } + return all_reduce_ops; + } + void InsertFusedAllReduce(const std::vector &places, const std::vector &local_scopes, const size_t num_of_all_reduce, diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc index 85da8a827f..59221c2e0c 100644 --- a/paddle/fluid/operators/alloc_continuous_space_op.cc +++ b/paddle/fluid/operators/alloc_continuous_space_op.cc @@ -227,8 +227,11 @@ REGISTER_OPERATOR(alloc_continuous_space, paddle::operators::AllocContinuousSpaceOp, paddle::operators::AllocContinuousSpaceOpMaker); namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CPU_KERNEL( alloc_continuous_space, + ops::AllocContinuousSpaceKernel, ops::AllocContinuousSpaceKernel, ops::AllocContinuousSpaceKernel, ops::AllocContinuousSpaceKernel, ops::AllocContinuousSpaceKernel, ops::AllocContinuousSpaceKernel, ops::AllocContinuousSpaceKernel namespace paddle { namespace operators { @@ -46,6 +46,17 @@ class SGDOp : public framework::OperatorWithKernel { auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); return framework::OpKernelType(data_type, ctx.device_context()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (var_name == "LearningRate") { + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } }; class SGDOpInferVarType : public framework::VarTypeInference { diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 975e4b8e72..fca982821a 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -46,7 +46,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, // Atomic Operation to avoid concurrent write error. paddle::platform::CudaAtomicAdd( tensor_out_ptr + index, - -1.0 * learning_rate[0] * selected_rows_ptr[index]); + -static_cast(1.0) * learning_rate[0] * selected_rows_ptr[index]); } } } @@ -122,5 +122,7 @@ class SGDOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel, - ops::SGDOpCUDAKernel); + ops::SGDOpCUDAKernel, + ops::SGDOpCUDAKernel); diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py new file mode 100644 index 0000000000..5ccf855ebc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py @@ -0,0 +1,91 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle.fluid.core as core +import math +import os +import sys +import unittest + +import numpy as np +import paddle +import paddle.fluid as fluid +from simple_nets import init_data +from parallel_executor_test_base import TestParallelExecutorBase + +batch_size = 12 +img_shape = [1, 28, 28] + + +def loss_net(hidden, label): + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + return avg_loss + + +def conv_net(use_feed): + img = fluid.layers.data(name='image', shape=img_shape, dtype='float16') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + + conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + hidden = fluid.layers.cast(conv_pool_2, np.float32) + return loss_net(hidden, label) + + +def _optimizer(learning_rate=1e-6): + optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) + return optimizer + + +class TestResnet(TestParallelExecutorBase): + def check_model(self, use_cuda): + img, label = init_data( + batch_size=batch_size, img_shape=img_shape, label_range=9) + img = np.float16(img).view(np.uint16) + feed_dict = {"image": img, "label": label} + + TestParallelExecutorBase.check_network_convergence( + conv_net, + feed_dict=feed_dict, + iter=10, + use_cuda=use_cuda, + fuse_all_reduce_ops=True, + optimizer=_optimizer) + + def test_model(self): + if core.is_compiled_with_cuda(): + self.check_model(True) + + +if __name__ == '__main__': + unittest.main() -- GitLab