Unverified commit fd3aad6c, authored by chengduo and committed by GitHub

Make fuse_optimizer_op_pass also work when the model contains sparse gradients. (#18664)

* support sparse gradients
test=develop
Parent 6b78e00d
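The heart of this change is that parameter gradients are now split by variable type: only dense (LoDTensor) gradients are grouped, coalesced, and handed to the fused optimizer ops, while sparse gradients are recorded separately and left unfused. Below is a condensed, illustrative sketch of that classification, assuming the Paddle fluid headers; the helper name SplitDenseAndSparseGrads is invented here for clarity, the real logic lives in CoalesceGradTensorPass::ApplyImpl further down in this diff.

#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"

namespace paddle {
namespace framework {
namespace ir {

// Hypothetical helper: split (param, grad) pairs into dense and sparse groups
// based on the gradient variable's type, mirroring what the pass below does.
void SplitDenseAndSparseGrads(
    const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
    const details::ParamsAndGrads &params_grads,
    details::ParamsAndGrads *dense, details::ParamsAndGrads *sparse) {
  for (auto &p_g : params_grads) {
    auto iter = vars_info.find(p_g.second);
    PADDLE_ENFORCE(iter != vars_info.end(), "%s is not found.", p_g.second);
    // Only LOD_TENSOR gradients can be coalesced into one contiguous chunk.
    if (iter->second.front()->Var()->GetType() == proto::VarType::LOD_TENSOR) {
      dense->emplace_back(p_g);
    } else {
      // e.g. SelectedRows gradients produced by sparse lookups stay unfused.
      sparse->emplace_back(p_g);
    }
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle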
@@ -95,5 +95,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
             fuse_elewise_add_act_pass multi_batch_merge_pass
             fuse_relu_depthwise_conv_pass
             memory_optimize_pass lock_free_optimize_pass
-            alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
+            coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
             fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass)
@@ -76,6 +76,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     PADDLE_ENFORCE(!FLAGS_use_mkldnn,
                    "Please compile with MKLDNN first to use MKLDNN");
 #endif
     if (strategy_.enable_sequential_execution_) {
       VLOG(1) << "Add sequential_execution_pass";
       AppendPass("sequential_execution_pass");
@@ -108,30 +109,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }
     // for single card training, fuse_all_reduce_ops is unnecessary.
-    // alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
+    // coalesce_grad_tensor_pass should be before of MultiDevPass.
     if (strategy_.fuse_all_reduce_ops_) {
-      VLOG(1) << "Add alloc_continuous_space_for_grad_pass";
-      AppendPass("alloc_continuous_space_for_grad_pass");
+      VLOG(1) << "Add coalesce_grad_tensor_pass";
+      AppendPass("coalesce_grad_tensor_pass");
     }
+    // Fuse all the optimization operators.
+    if (strategy_.is_distribution_) {
+      VLOG(3) << "Currently, fuse_all_optimizer_ops only works under "
+                 "Non-distributed mode.";
+      strategy_.fuse_all_optimizer_ops_ = false;
+    }
+    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
+        strategy_.is_distribution_) {
+      VLOG(3) << "Currently, fuse_all_optimizer_ops only works under AllReduce "
+                 "mode.";
+      strategy_.fuse_all_optimizer_ops_ = false;
+    }
     if (strategy_.fuse_all_optimizer_ops_) {
-      if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
-          strategy_.is_distribution_) {
-        VLOG(3)
-            << "Currently, fuse_all_optimizer_ops only works under AllReduce "
-               "mode.";
-        strategy_.fuse_all_optimizer_ops_ = false;
-      } else {
-        // NOTE: fuse_all_xx_ops will count the number of xx operator first,
-        // if the number is zero, fuse_all_reduce_ops will do nothing.
-        // Currently, only one type of optimization algorithm can be fused.
-        VLOG(1) << "Add fuse_adam_op_pass";
-        AppendPass("fuse_adam_op_pass");
-        VLOG(1) << "Add fuse_sgd_op_pass";
-        AppendPass("fuse_sgd_op_pass");
-        VLOG(1) << "Add fuse_momentum_op_pass";
-        AppendPass("fuse_momentum_op_pass");
-      }
+      // NOTE: fuse_all_xx_ops will count the number of xx operator first,
+      // if the number is zero, fuse_all_reduce_ops will do nothing.
+      // Currently, only one type of optimization algorithm can be fused.
+      VLOG(1) << "Add fuse_adam_op_pass";
+      AppendPass("fuse_adam_op_pass");
+      VLOG(1) << "Add fuse_sgd_op_pass";
+      AppendPass("fuse_sgd_op_pass");
+      VLOG(1) << "Add fuse_momentum_op_pass";
+      AppendPass("fuse_momentum_op_pass");
     }
     // Add a graph viz pass to record a graph.
@@ -301,7 +306,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
 #endif
-    } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
+    } else if (pass->Type() == "coalesce_grad_tensor_pass" ||
                pass->Type() == "fuse_adam_op_pass" ||
                pass->Type() == "fuse_sgd_op_pass" ||
                pass->Type() == "fuse_momentum_op_pass" ||
@@ -321,7 +326,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
             new bool(use_hierarchical_allreduce_));
 #endif
       }
-    } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
+    } else if (pass->Type() == "coalesce_grad_tensor_pass") {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
       pass->Erase(kLocalScopes);
@@ -389,7 +394,7 @@ USE_PASS(backward_optimizer_op_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
 USE_PASS(inplace_pass);
 USE_PASS(lock_free_optimize_pass);
-USE_PASS(alloc_continuous_space_for_grad_pass);
+USE_PASS(coalesce_grad_tensor_pass);
 USE_PASS(graph_to_program_pass);
 USE_PASS(fuse_adam_op_pass);
 USE_PASS(fuse_sgd_op_pass);
......
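For reference, the pass list assembled above is driven by a few BuildStrategy members. A minimal sketch of setting them from C++ follows; it is not part of this commit, and the member names are simply taken from the code above (whether they are set directly like this or through the Python BuildStrategy wrapper depends on how Paddle is used).

#include "paddle/fluid/framework/details/build_strategy.h"

// Minimal sketch: the flags that make the builder above append
// coalesce_grad_tensor_pass and the fuse_{adam,sgd,momentum}_op_pass passes.
// Under ReduceStrategy::kReduce or distributed training, the builder resets
// fuse_all_optimizer_ops_ back to false, as shown above.
paddle::framework::details::BuildStrategy MakeFusionStrategy() {
  paddle::framework::details::BuildStrategy strategy;
  strategy.fuse_all_reduce_ops_ = true;     // enables gradient coalescing
  strategy.fuse_all_optimizer_ops_ = true;  // enables optimizer-op fusion
  strategy.reduce_ =
      paddle::framework::details::BuildStrategy::ReduceStrategy::kAllReduce;
  return strategy;
}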
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/profiler.h"
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
@@ -24,19 +25,6 @@ namespace paddle {
 namespace framework {
 namespace details {
-// Note(zcd): Addresses should be aligned, otherwise, the results may have
-// diff.
-static size_t Alignment(size_t size, const platform::Place &place) {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  size_t alignment = 1 << 12;
-  if (platform::is_gpu_place(place)) {
-    // Allow to allocate the minimum chunk size is 256 B.
-    alignment = 1 << 8;
-  }
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
@@ -121,7 +109,7 @@ void FusedAllReduceOpHandle::RunImpl() {
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data<void>();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = Alignment(len * size_of_dtype, places_[0]);
+      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data<void>();
@@ -241,8 +229,8 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
     // Get element number
     int64_t len = grad_tensor.at(i).second->numel();
     PADDLE_ENFORCE_GT(len, 0);
-    // Alignment(len)
-    *numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel +=
+        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
......
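The Alignment helper removed above now lives behind paddle/fluid/platform/device_memory_aligment.h as platform::Alignment. Its behaviour, as seen in the deleted code, is simply "round the byte size up to the minimum chunk size" (4 KB for CPU allocations, 256 B for GPU allocations). A small standalone sketch of that rounding rule, written without the Paddle types so it compiles on its own:

#include <cstddef>
#include <iostream>

// Round `size` bytes up to a multiple of `alignment` (4096 for CPU, 256 for
// GPU in the deleted helper above).
static size_t AlignedSize(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  // A float32 gradient with 1000 elements occupies 4000 bytes; with a 256-byte
  // minimum chunk it is padded to the next multiple of 256, i.e. 4096.
  std::cout << AlignedSize(1000 * sizeof(float), 256) << "\n";   // 4096
  // With a 4 KB minimum chunk the same tensor also pads to 4096.
  std::cout << AlignedSize(1000 * sizeof(float), 4096) << "\n";  // 4096
  return 0;
}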
@@ -62,11 +62,15 @@ typedef std::vector<std::string> FusedGrads;
 constexpr char kFusedGrads[] = "fused_gradients";
 typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
-constexpr char kParamsAndGrads[] = "params_grads";
+constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads";
+constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
+typedef std::vector<ProgramDesc> ProgramDescs;
+constexpr char kProgramDescs[] = "program_descs";
 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
     GroupParamsAndGrads;
-constexpr char kGroupParamsAndGrads[] = "group_params_grads";
+constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads";
 }  // namespace details
 }  // namespace framework
......
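The new kProgramDescs attribute lets a pass describe its initialization work (for example the coalesce_tensor op) as a small program stored on the graph, instead of running ops against places and scopes inside the pass itself. A rough, illustrative sketch of the recording side is given below, pieced together only from calls that appear later in this diff; the function name RecordCoalesceProgram is invented here, and the replay side is the executor change shown in the next file.

#include <string>
#include <vector>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"

namespace paddle {
namespace framework {
namespace ir {

// Illustrative only: queue a "coalesce_tensor" init op on the graph. The
// executor later runs every op of every recorded ProgramDesc once per local
// scope, so the pass no longer needs places/local scopes to do initialization.
void RecordCoalesceProgram(Graph *graph,
                           const std::vector<std::string> &params,
                           const std::vector<std::string> &grads,
                           const std::string &fused_var_name) {
  if (!graph->Has(details::kProgramDescs)) {
    graph->Set(details::kProgramDescs, new details::ProgramDescs);
  }
  graph->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
  ProgramDesc &program =
      graph->Get<details::ProgramDescs>(details::kProgramDescs).back();
  auto *op_desc = program.MutableBlock(0)->AppendOp();
  op_desc->SetType("coalesce_tensor");
  op_desc->SetInput("Input", params);
  op_desc->SetOutput("Output", grads);
  op_desc->SetOutput("FusedOutput", {fused_var_name});
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle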
@@ -17,6 +17,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -70,6 +72,29 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
       InitializeVariable(pair.first, pair.second);
     }
   }
+  const ir::Graph &graph = Graph();
+  if (graph.Has(details::kProgramDescs)) {
+    auto &program_descs =
+        graph.Get<details::ProgramDescs>(details::kProgramDescs);
+    // Init vars
+    auto &fused_grad_vars = graph.Get<details::FusedVars>(details::kFusedVars);
+    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+      for (auto &var_name : fused_grad_vars) {
+        auto var = local_exec_scopes_[i]->Var(var_name);
+        var->GetMutable<LoDTensor>();
+      }
+    }
+    for (auto &program_desc : program_descs) {
+      for (auto &op_desc : program_desc.Block(0).AllOps()) {
+        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+          auto op = OpRegistry::CreateOp(*op_desc);
+          op->Run(*local_exec_scopes_[i], places_[i]);
+        }
+      }
+    }
+  }
 }
 void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
......
@@ -45,7 +45,7 @@ cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
 cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
-cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
+cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
+#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
 #include <algorithm>
 #include <map>
 #include <string>
@@ -53,107 +53,109 @@ static constexpr double kMB = 1048576.0;
 void SetFuseParameterGroupsSize(int group_size) {
   FLAGS_fuse_parameter_groups_size = group_size;
 }
 int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; }
 void SetFuseParameterMemorySize(double memory_size) {
   FLAGS_fuse_parameter_memory_size = memory_size;
 }
 double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; }
-class AllocContinuousSpaceForGradPass : public ir::Pass {
+class CoalesceGradTensorPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const {
     ir::Graph &result = *graph;
-    auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
-    auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
-    ResetAttribute<details::ParamsAndGrads>(details::kParamsAndGrads, &result);
-    ResetAttribute<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads,
-                                                 &result);
-    auto &params_grads =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
+    details::ParamsAndGrads params_grads;
     RecordParamsAndGrads(result, &params_grads);
-    auto num_params_grads = params_grads.size();
-    VLOG(10) << "The number of params and grads is:" << num_params_grads;
-    if (num_params_grads == 0) {
+    VLOG(10) << "The number of params and grads is:" << params_grads.size();
+    if (params_grads.size() == 0) {
       return;
     }
-    std::unordered_map<std::string, ir::Node *> var_name2node;
-    std::unordered_map<std::string, std::unordered_set<ir::Node *>>
-        var_name2node_set;
-    for (ir::Node *node : result.Nodes()) {
-      if (node->IsVar() && node->Var()) {
-        // Note: The graph may have the same name node. For example, parameter
-        // is the input of operator and it also is the output of optimizer;
-        var_name2node.emplace(node->Var()->Name(), node);
-        var_name2node_set[node->Var()->Name()].emplace(node);
+    auto vars_info = GetVarInfo(result);
+    ResetAttribute<details::ParamsAndGrads>(details::kParamsAndDenseGrads,
+                                            &result);
+    ResetAttribute<details::ParamsAndGrads>(details::kParamsAndSparseGrads,
+                                            &result);
+    ResetAttribute<details::GroupParamsAndGrads>(
+        details::kGroupParamsAndDenseGrads, &result);
+    auto &p_g_dense_grad =
+        result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
+    auto &p_g_sparse_grad =
+        result.Get<details::ParamsAndGrads>(details::kParamsAndSparseGrads);
+    for (auto &param_grad : params_grads) {
+      if (IsLoDTensorType(GetTypeOfVar(vars_info, param_grad.second))) {
+        p_g_dense_grad.emplace_back(param_grad);
+      } else {
+        p_g_sparse_grad.emplace_back(param_grad);
       }
     }
-    auto &group_params_grads =
-        result.Get<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads);
-    // Note: the order of params_grads may be changed by SetGroupParamsAndGrads.
-    SetGroupParamsAndGrads(var_name2node, params_grads, &group_params_grads);
+    VLOG(10) << "Dense grads: " << p_g_dense_grad.size()
+             << ", Sparse grads: " << p_g_sparse_grad.size();
+    if (p_g_dense_grad.size() == 0) {
+      return;
+    }
+    auto num_of_p_g_dense_grad = p_g_dense_grad.size();
+    auto &group_params_grads = result.Get<details::GroupParamsAndGrads>(
+        details::kGroupParamsAndDenseGrads);
+    // Note: the order of p_g_dense_grad may be changed by
+    // SetGroupParamsAndGrads.
+    SetGroupParamsAndGrads(vars_info, p_g_dense_grad, &group_params_grads);
-    params_grads.clear();
-    params_grads.reserve(num_params_grads);
+    p_g_dense_grad.clear();
+    p_g_dense_grad.reserve(num_of_p_g_dense_grad);
     for (auto &group_p_g : group_params_grads) {
-      params_grads.insert(params_grads.end(), group_p_g.begin(),
-                          group_p_g.end());
+      p_g_dense_grad.insert(p_g_dense_grad.end(), group_p_g.begin(),
+                            group_p_g.end());
     }
     PADDLE_ENFORCE_EQ(
-        num_params_grads, params_grads.size(),
-        "The number of params_grads is not consistent with before.");
+        p_g_dense_grad.size(), num_of_p_g_dense_grad,
+        "The number of p_g_dense_grad is not consistent with before.");
-    if (IsUnifiedDtype(params_grads, var_name2node)) {
-      SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
-      AllocContinuousAddressSpace(places, local_scopes, var_name2node,
-                                  params_grads, &result);
+    if (IsUnifiedDtype(p_g_dense_grad, vars_info)) {
+      SetGradientPersistable(p_g_dense_grad, vars_info);
+      CoalesceTensors(vars_info, p_g_dense_grad, &result);
     } else {
-      // Set Gradients as Persistable to prevent this var becoming reusable.
       for (auto &sub_param_grad : group_params_grads) {
-        SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
-        PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, var_name2node),
+        SetGradientPersistable(p_g_dense_grad, vars_info);
+        PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, vars_info),
                        "The data type of the same group is not consistent.");
-        AllocContinuousAddressSpace(places, local_scopes, var_name2node,
-                                    sub_param_grad, &result);
+        CoalesceTensors(vars_info, sub_param_grad, &result);
      }
    }
  }
   void SetGradientPersistable(
       const std::vector<std::pair<std::string, std::string>> &sub_param_grad,
-      const std::unordered_map<std::string, Node *> &var_name2node,
-      const std::unordered_map<std::string, std::unordered_set<ir::Node *>>
-          &var_name2node_set) const {
+      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info)
+      const {
     for (auto &p_g : sub_param_grad) {
-      // Get gradient var
-      auto iter = var_name2node.find(p_g.second);
-      PADDLE_ENFORCE(iter != var_name2node.end(), "%s is not found.",
-                     p_g.second);
+      auto iter = vars_info.find(p_g.second);
+      PADDLE_ENFORCE(iter != vars_info.end(), "%s is not found.", p_g.second);
+      PADDLE_ENFORCE(!iter->second.empty());
       // Set persistable
-      auto same_nodes = var_name2node_set.find(p_g.second);
-      PADDLE_ENFORCE(same_nodes != var_name2node_set.end(), "%s is not found.",
-                     p_g.second);
-      for (auto it : same_nodes->second) {
+      for (auto it : iter->second) {
+        PADDLE_ENFORCE_NOT_NULL(it->Var());
         it->Var()->SetPersistable(true);
       }
-      PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
+      PADDLE_ENFORCE(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second)));
     }
   }
   bool IsUnifiedDtype(
       const details::ParamsAndGrads &params_grads,
-      const std::unordered_map<std::string, Node *> &var_name2node) const {
-    auto dtype =
-        this->GetDtypeOfVar(var_name2node, params_grads.front().second);
+      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info)
+      const {
+    if (params_grads.empty()) return true;
+    auto dtype = GetDtypeOfVar(vars_info, params_grads.front().second);
     for (auto p_g : params_grads) {
-      auto next_dtype = this->GetDtypeOfVar(var_name2node, p_g.second);
+      auto next_dtype = GetDtypeOfVar(vars_info, p_g.second);
       if (next_dtype != dtype) {
         return false;
       }
...@@ -161,10 +163,8 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -161,10 +163,8 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
return true; return true;
} }
void AllocContinuousAddressSpace( void CoalesceTensors(
const std::vector<platform::Place> &places, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::vector<Scope *> &local_scopes,
const std::unordered_map<std::string, Node *> &var_name2node,
const details::ParamsAndGrads &params_grads, Graph *result) const { const details::ParamsAndGrads &params_grads, Graph *result) const {
// Create a FusedVarsSet to avoid duplicating names for fused_var in other // Create a FusedVarsSet to avoid duplicating names for fused_var in other
// pass. // pass.
...@@ -175,20 +175,22 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -175,20 +175,22 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
if (!result->Has(details::kFusedGrads)) { if (!result->Has(details::kFusedGrads)) {
result->Set(details::kFusedGrads, new details::FusedGrads); result->Set(details::kFusedGrads, new details::FusedGrads);
} }
if (!result->Has(details::kProgramDescs)) {
result->Set(details::kProgramDescs, new details::ProgramDescs);
}
// the fused_var_name should be unique, so it appends // the fused_var_name should be unique, so it appends
// params_grads.begin()->second. // params_grads.begin()->second.
auto fused_var_name = std::string(details::kFusedVarNamePrefix) + "@GRAD@" + auto fused_grad_var_name = std::string(details::kFusedVarNamePrefix) +
params_grads.begin()->second; "@GRAD@" + params_grads.begin()->second;
result->Get<details::FusedGrads>(details::kFusedGrads)
.emplace_back(fused_var_name);
auto &fused_var_set = result->Get<details::FusedVars>(details::kFusedVars); auto &fused_var_set = result->Get<details::FusedVars>(details::kFusedVars);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, PADDLE_ENFORCE_EQ(fused_var_set.count(fused_grad_var_name), 0,
"%s is duplicate in FusedVars.", fused_var_name); "%s is duplicate in FusedVars.", fused_grad_var_name);
fused_var_set.insert(fused_var_name); fused_var_set.insert(fused_grad_var_name);
result->Get<details::FusedGrads>(details::kFusedGrads)
.emplace_back(fused_grad_var_name);
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, var_name2node, InitFusedVarsAndAllocSpaceForVars(vars_info, fused_grad_var_name,
fused_var_name, params_grads); params_grads, result);
} }
template <typename AttrType> template <typename AttrType>
...@@ -201,16 +203,18 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -201,16 +203,18 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
} }
void SetGroupParamsAndGrads( void SetGroupParamsAndGrads(
const std::unordered_map<std::string, ir::Node *> &var_nodes, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const details::ParamsAndGrads &params_grads, const details::ParamsAndGrads &params_grads,
details::GroupParamsAndGrads *group_params_grads) const { details::GroupParamsAndGrads *group_params_grads) const {
SetGroupAccordingToLayers(var_nodes, params_grads, group_params_grads); SetGroupAccordingToLayers(vars_info, params_grads, group_params_grads);
SetGroupAccordingToMemorySize(var_nodes, group_params_grads); SetGroupAccordingToMemorySize(vars_info, group_params_grads);
ReGroupByDtype(var_nodes, params_grads, group_params_grads); if (!IsUnifiedDtype(params_grads, vars_info)) {
ReGroupByDtype(vars_info, group_params_grads);
}
} }
void SetGroupAccordingToLayers( void SetGroupAccordingToLayers(
const std::unordered_map<std::string, ir::Node *> &var_nodes, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const details::ParamsAndGrads &params_grads, const details::ParamsAndGrads &params_grads,
details::GroupParamsAndGrads *group_params_grads) const { details::GroupParamsAndGrads *group_params_grads) const {
std::map<std::string, size_t> var_idx; std::map<std::string, size_t> var_idx;
...@@ -241,41 +245,38 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -241,41 +245,38 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
if (VLOG_IS_ON(10)) { if (VLOG_IS_ON(10)) {
VLOG(10) << "SetGroupAccordingToLayers: "; VLOG(10) << "SetGroupAccordingToLayers: ";
PrintGroupInfo(var_nodes, group_params_grads); PrintGroupInfo(vars_info, group_params_grads);
} }
} }
void PrintGroupInfo( void PrintGroupInfo(
const std::unordered_map<std::string, ir::Node *> &var_nodes, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
details::GroupParamsAndGrads *group_params_grads) const { details::GroupParamsAndGrads *group_params_grads) const {
for (size_t i = 0; i < group_params_grads->size(); ++i) { for (size_t i = 0; i < group_params_grads->size(); ++i) {
VLOG(10) << "group " << i; VLOG(10) << "group " << i;
std::stringstream out; std::stringstream out;
size_t gps_size = 0; size_t gps_size = 0;
for (auto &p_g : group_params_grads->at(i)) { for (auto &p_g : group_params_grads->at(i)) {
auto iter = var_nodes.find(p_g.first); auto var_desc = GetVarDescFromVarsInfo(vars_info, p_g.first);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", p_g.first); auto shape = var_desc->GetShape();
auto shape = iter->second->Var()->GetShape(); size_t size = framework::SizeOfType(var_desc->GetDataType());
size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
std::for_each(shape.begin(), shape.end(), std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; }); [&size](const int64_t &n) { size *= n; });
gps_size += size; gps_size += size;
out << string::Sprintf("(%s(%d), %s)", p_g.first, size, p_g.second); out << string::Sprintf("(%s(%d), %s)", p_g.first, size, p_g.second);
} }
auto dtype = this->GetDtypeOfVar(var_nodes, auto dtype =
group_params_grads->at(i).front().first); GetDtypeOfVar(vars_info, group_params_grads->at(i).front().first);
VLOG(10) << out.str() VLOG(10) << out.str()
<< ", group size:" << group_params_grads->at(i).size() << ", group size:" << group_params_grads->at(i).size()
<< ", group memory size:" << static_cast<double>(gps_size) / kMB << ", group memory size:" << static_cast<double>(gps_size) / kMB
<< "(MB)" << "(MB), dtype:" << dtype;
<< ", dtype:" << dtype;
} }
} }
void SetGroupAccordingToMemorySize( void SetGroupAccordingToMemorySize(
const std::unordered_map<std::string, ir::Node *> &var_nodes, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
details::GroupParamsAndGrads *group_params_grads) const { details::GroupParamsAndGrads *group_params_grads) const {
const double group_memory_size = GetFuseParameterMemorySize(); const double group_memory_size = GetFuseParameterMemorySize();
if (group_memory_size <= 0.0) { if (group_memory_size <= 0.0) {
...@@ -290,28 +291,19 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -290,28 +291,19 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
size_t local_group_memory_size = 0; size_t local_group_memory_size = 0;
while (j < group_params_grads->size()) { while (j < group_params_grads->size()) {
std::for_each( for (auto &p_g_iter : group_params_grads->at(j)) {
group_params_grads->at(j).begin(), group_params_grads->at(j).end(), auto var_desc = GetVarDescFromVarsInfo(vars_info, p_g_iter.second);
[&local_group_memory_size, size_t size = framework::SizeOfType(var_desc->GetDataType());
&var_nodes](const std::pair<std::string, std::string> &p_g) { auto shape = var_desc->GetShape();
auto iter = var_nodes.find(p_g.second); std::for_each(shape.begin(), shape.end(),
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", [&size](const int64_t &n) { size *= n; });
p_g.second); local_group_memory_size += size;
}
size_t size =
framework::SizeOfType(iter->second->Var()->GetDataType());
auto shape = iter->second->Var()->GetShape();
std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; });
local_group_memory_size += size;
});
group_p_g.insert(group_p_g.end(), group_params_grads->at(j).begin(), group_p_g.insert(group_p_g.end(), group_params_grads->at(j).begin(),
group_params_grads->at(j).end()); group_params_grads->at(j).end());
++j; ++j;
if (j >= group_params_grads->size()) { if (j >= group_params_grads->size()) {
break; break;
} }
...@@ -333,20 +325,15 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -333,20 +325,15 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
if (VLOG_IS_ON(10)) { if (VLOG_IS_ON(10)) {
VLOG(10) << string::Sprintf( VLOG(10) << string::Sprintf(
"SetGroupAccordingToMemorySize(memory_size: %f):", group_memory_size); "SetGroupAccordingToMemorySize(memory_size: %f MB):",
PrintGroupInfo(var_nodes, group_params_grads); GetFuseParameterMemorySize());
PrintGroupInfo(vars_info, group_params_grads);
} }
} }
void ReGroupByDtype( void ReGroupByDtype(
const std::unordered_map<std::string, ir::Node *> &var_nodes, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const details::ParamsAndGrads &params_grads,
details::GroupParamsAndGrads *group_params_grads) const { details::GroupParamsAndGrads *group_params_grads) const {
if (IsUnifiedDtype(params_grads, var_nodes)) {
VLOG(1) << "needn't regroup fusion params_grads";
return;
}
details::GroupParamsAndGrads new_group_params_grads; details::GroupParamsAndGrads new_group_params_grads;
for (auto &group_p_g : *group_params_grads) { for (auto &group_p_g : *group_params_grads) {
...@@ -354,7 +341,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -354,7 +341,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
details::GroupParamsAndGrads local_group_params_grads; details::GroupParamsAndGrads local_group_params_grads;
for (auto &p_g : group_p_g) { for (auto &p_g : group_p_g) {
auto dtype = GetDtypeOfVar(var_nodes, p_g.second); auto dtype = GetDtypeOfVar(vars_info, p_g.second);
size_t idx = 0; size_t idx = 0;
auto var_idx_iter = type_idx.find(dtype); auto var_idx_iter = type_idx.find(dtype);
...@@ -383,25 +370,53 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -383,25 +370,53 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
VLOG(10) << string::Sprintf("ReGroupByDtype(memory_size: %f MB, %u):", VLOG(10) << string::Sprintf("ReGroupByDtype(memory_size: %f MB, %u):",
GetFuseParameterMemorySize(), GetFuseParameterMemorySize(),
GetFuseParameterGroupsSize()); GetFuseParameterGroupsSize());
PrintGroupInfo(var_nodes, group_params_grads); PrintGroupInfo(vars_info, group_params_grads);
} }
} }
proto::VarType::Type GetDtypeOfVar( proto::VarType::Type GetDtypeOfVar(
const std::unordered_map<std::string, Node *> &var_nodes, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::string &name) const {
auto var_desc = GetVarDescFromVarsInfo(vars_info, name);
return var_desc->GetDataType();
}
proto::VarType::Type GetTypeOfVar(
const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::string &name) const { const std::string &name) const {
auto grad_iter = var_nodes.find(name); auto var_desc = GetVarDescFromVarsInfo(vars_info, name);
PADDLE_ENFORCE(grad_iter != var_nodes.end()); return var_desc->GetType();
PADDLE_ENFORCE_NOT_NULL(grad_iter->second->Var());
return grad_iter->second->Var()->GetDataType();
} }
private: private:
bool IsSupportedVarType(const proto::VarType::Type &type) const { bool IsLoDTensorType(const proto::VarType::Type &type) const {
// Current only support LOD_TENSOR. // Current only support LOD_TENSOR.
return type == proto::VarType::LOD_TENSOR; return type == proto::VarType::LOD_TENSOR;
} }
std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
const Graph &result) const {
std::unordered_map<std::string, std::vector<Node *>> vars;
for (Node *node : result.Nodes()) {
if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer;
vars[node->Var()->Name()].emplace_back(node);
}
}
return vars;
}
const VarDesc *GetVarDescFromVarsInfo(
const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
const std::string &var_name) const {
auto grad_iter = vars_info.find(var_name);
PADDLE_ENFORCE(grad_iter != vars_info.end(), "%s is not found.", var_name);
PADDLE_ENFORCE(!grad_iter->second.empty());
PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
return grad_iter->second.front()->Var();
}
void RecordParamsAndGrads(const ir::Graph &graph, void RecordParamsAndGrads(const ir::Graph &graph,
details::ParamsAndGrads *params_grads) const { details::ParamsAndGrads *params_grads) const {
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(graph); std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(graph);
...@@ -431,30 +446,9 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -431,30 +446,9 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
} }
void InitFusedVarsAndAllocSpaceForVars( void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::vector<Scope *> &local_scopes,
const std::unordered_map<std::string, ir::Node *> &vars,
const std::string &fused_var_name, const std::string &fused_var_name,
const details::ParamsAndGrads &params_grads) const { const details::ParamsAndGrads &params_grads, ir::Graph *result) const {
// Init Gradients and FusedVars
VLOG(10) << "Init FusedVars and Gradients.";
for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
auto &scope = *it;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has existed in scope.", fused_var_name);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
for (auto &p_g : params_grads) {
auto iter = vars.find(p_g.second);
PADDLE_ENFORCE(iter != vars.end());
PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
proto::VarType::LOD_TENSOR);
scope->Var(p_g.second)->GetMutable<LoDTensor>();
}
}
// Alloc continuous space for vars. // Alloc continuous space for vars.
std::vector<std::string> grads_name; std::vector<std::string> grads_name;
std::vector<std::string> params_name; std::vector<std::string> params_name;
...@@ -464,16 +458,13 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -464,16 +458,13 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
params_name.emplace_back(p_g.first); params_name.emplace_back(p_g.first);
grads_name.emplace_back(p_g.second); grads_name.emplace_back(p_g.second);
} }
framework::ProgramDesc program_desc;
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
program_desc.MutableBlock(0));
for (size_t i = 0; i < local_scopes.size(); ++i) { result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
for (auto &op_desc : program_desc.Block(0).AllOps()) { ProgramDesc &program_desc =
auto op = OpRegistry::CreateOp(*op_desc); result->Get<details::ProgramDescs>(details::kProgramDescs).back();
op->Run(*local_scopes[i], places[i]); auto *global_block = program_desc.MutableBlock(0);
} AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
} global_block);
} }
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name, void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
@@ -481,7 +472,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
                                  const std::string &fused_var_name,
                                  BlockDesc *global_block) const {
     auto op_desc = global_block->AppendOp();
-    op_desc->SetType("alloc_continuous_space");
+    op_desc->SetType("coalesce_tensor");
     op_desc->SetInput("Input", params_name);
     op_desc->SetOutput("Output", grads_name);
     op_desc->SetOutput("FusedOutput", {fused_var_name});
@@ -491,7 +482,7 @@
 }  // namespace framework
 }  // namespace paddle
-REGISTER_PASS(alloc_continuous_space_for_grad_pass,
-              paddle::framework::ir::AllocContinuousSpaceForGradPass)
+REGISTER_PASS(coalesce_grad_tensor_pass,
+              paddle::framework::ir::CoalesceGradTensorPass)
     .RequirePassAttr(paddle::framework::details::kPlaces)
     .RequirePassAttr(paddle::framework::details::kLocalScopes);
@@ -25,9 +25,6 @@ namespace ir {
 void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   ir::Graph &result = *graph;
-  auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
-  auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
   const std::string fuse_op_type = GetOpType();
   std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
   aux_var_names.emplace_back(kParam);
...@@ -35,70 +32,91 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -35,70 +32,91 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
// Step 1: Get the specified op and auxiliary variables. // Step 1: Get the specified op and auxiliary variables.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result); std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
std::unordered_map<std::string, std::vector<std::string>> aux_var_set; auto vars_info = GetVarInfo(result);
std::vector<ir::Node *> opt_ops; std::vector<ir::Node *> opt_nodes;
size_t opt_ops_num = 0;
// Note: Only take care about the dense gradients.
for (auto &node : topo_nodes) { for (auto &node : topo_nodes) {
GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops, if (node->Op()->Type() == fuse_op_type) {
&aux_var_set); auto grad_name = node->Op()->Input(kGrad);
} PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1));
if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
VLOG(6) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); opt_nodes.emplace_back(node);
if (opt_ops.size() == 0) { }
return; ++opt_ops_num;
}
} }
if (result.Has(details::kFusedOptType)) { VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num
VLOG(6) << "Currently only support fusing one type optimizer op. Has fused " << ", and " << opt_nodes.size() << " for dense gradients ";
<< result.Get<details::FusedOptType>(details::kFusedOptType); if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) {
if (result.Has(details::kFusedOptType)) {
auto &opt_type =
result.Get<details::FusedOptType>(details::kFusedOptType);
VLOG(6) << "Currently only support fusing one type optimizer op. "
"Has fused "
<< opt_type;
}
return; return;
} else {
result.Set(details::kFusedOptType, new details::FusedOptType);
} }
result.Set(details::kFusedOptType, new details::FusedOptType);
result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type; result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
if (!result.Has(details::kProgramDescs)) {
result.Set(details::kProgramDescs, new details::ProgramDescs);
}
// Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
// initialized in scopes before execution. // initialized in scopes before execution.
if (!result.Has(details::kFusedVars)) { if (!result.Has(details::kFusedVars)) {
result.Set(details::kFusedVars, new details::FusedVars); result.Set(details::kFusedVars, new details::FusedVars);
} }
std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
GetSpecifiedOpsAndVars(aux_var_names, opt_nodes, &aux_var_set);
std::unordered_map<std::string, std::string> fused_vars_name; std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size()); fused_vars_name.reserve(aux_var_names.size());
auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars); auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars);
const std::string prefix(details::kFusedVarNamePrefix); const std::string prefix(details::kFusedVarNamePrefix);
// NOTE: the fused_var_name should be unique.
for (auto &var_name : aux_var_names) { for (auto &var_name : aux_var_names) {
// NOTE: the fused_var_name should be unique.
auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
aux_var_set[var_name][0]; aux_var_set[var_name][0];
VLOG(6) << var_name << ": " << fused_var_name; VLOG(6) << var_name << ": " << fused_var_name;
fused_vars_name.emplace(var_name, fused_var_name);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
fused_var_set.insert(fused_var_name); fused_var_set.insert(fused_var_name);
fused_vars_name.emplace(var_name, fused_var_name);
} }
// Step 3: Get the fused Gradient's name // Step 3: Get the fused Gradient's name
bool grad_fused = false; bool grad_fused = false;
if (result.Has(details::kParamsAndGrads)) { if (result.Has(details::kParamsAndDenseGrads)) {
auto &params_grads = // NOTE: kParamsAndDenseGrads is generated by
result.Get<details::ParamsAndGrads>(details::kParamsAndGrads); // alloc_continue_space_for_grad_pass
PADDLE_ENFORCE_EQ( auto &params_and_dense_grads =
params_grads.size(), aux_var_set.at(kGrad).size(), result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
"The number of gradients and optimizer ops is not equal."); PADDLE_ENFORCE_LE(
std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).begin(), params_and_dense_grads.size(), aux_var_set.at(kGrad).size(),
aux_var_set.at(kGrad).end()); "The number of dense gradients should be little than optimizer ops.");
size_t same_grad_num = 0; std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).size());
for (auto &p_g : params_grads) { for (auto &p_g : params_and_dense_grads) {
if (opt_grad_set.count(p_g.second)) { opt_grad_set.insert(p_g.second);
++same_grad_num; }
std::vector<size_t> new_grad_idx;
for (size_t idx = 0; idx < aux_var_set.at(kGrad).size(); ++idx) {
auto &grad = aux_var_set.at(kGrad).at(idx);
if (!opt_grad_set.count(grad)) {
new_grad_idx.emplace_back(idx);
} }
} }
// NOTE(zcd): the gradient of kParamsAndGrads may be different with the // NOTE(zcd): the gradient of kParamsAndDenseGrads may be different
// kGrad. // with the kGrad. The gradients of kParamsAndDenseGrads is
if (same_grad_num == aux_var_set.at(kGrad).size()) { // collected during backward stage, but in optimization state, the
// some gradient's name maybe changed.
if (new_grad_idx.size() == 0) {
if (!result.Has(details::kFusedGrads)) { if (!result.Has(details::kFusedGrads)) {
PADDLE_THROW( PADDLE_THROW(
"The alloc_continuous_space_for_grad_pass should be called before " "The coalesce_grad_tensor_pass should "
"this pass."); "be called before this pass.");
} }
auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads); auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
PADDLE_ENFORCE_NE(fused_grad.size(), 0, PADDLE_ENFORCE_NE(fused_grad.size(), 0,
...@@ -115,136 +133,146 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -115,136 +133,146 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
// Sort the parameters and auxiliary variables according // Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly. // to parameters' name to make variables' name correspond correctly.
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); SortParametersAndAuxVars(params_and_dense_grads, &aux_var_set,
&opt_nodes);
grad_fused = true; grad_fused = true;
} else {
VLOG(10) << "The number of new gradients is " << new_grad_idx.size();
if (new_grad_idx.size() == 1) return;
// NOTE(zcd): If the gradients of backward stage and optimization stage
// have diff, Only take care of the the gradient of optimization stage.
GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_set);
} }
} }
// Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g. // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops
aux_var_names.pop_back(); // separately.
if (!grad_fused) { if (!grad_fused) {
InitFusedGradsAndAllocSpaceForGrads( InitFusedGradsAndAllocSpaceForGrads(aux_var_set.at(kParam),
places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad), aux_var_set.at(kGrad),
fused_vars_name.at(kGrad), &result); fused_vars_name.at(kGrad), &result);
} }
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, aux_var_names.pop_back();
aux_var_set, fused_vars_name); InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name,
&result);
// Step 5: Fuse optimizer Ops and Scale Ops // Step 5: Fuse optimizer Ops and Scale Ops
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);
// Step 6: Remove optimizer Ops // Step 6: Remove optimizer Ops
for (auto &opt_op : opt_ops) { for (auto &opt_op : opt_nodes) {
graph->RemoveNode(opt_op); graph->RemoveNode(opt_op);
} }
} }
void FuseOptimizerOpPass::GradientsFilter(
const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_set)
const {
for (auto &aux_vars : *aux_var_set) {
std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size());
for (size_t i : new_grad_idx) {
sorted_vars.emplace_back(aux_vars.second.at(i));
}
std::swap(aux_vars.second, sorted_vars);
if (VLOG_IS_ON(6)) {
std::stringstream out;
for (auto &var_name : aux_vars.second) {
out << var_name << " ";
}
VLOG(6) << aux_vars.first << ": " << out.str();
}
}
std::vector<Node *> sorted_ops;
for (size_t i : new_grad_idx) {
sorted_ops.emplace_back(opt_nodes->at(i));
}
std::swap(*opt_nodes, sorted_ops);
}
void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &params, const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name, const std::vector<std::string> &grads, const std::string &fused_grad_name,
ir::Graph *result) const { ir::Graph *result) const {
// Get Var Nodes auto vars_info = GetVarInfo(*result);
std::unordered_map<std::string, ir::Node *> vars;
for (ir::Node *node : result->Nodes()) {
if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer;
vars.emplace(node->Var()->Name(), node);
}
}
// Set Gradients as Persistable to prevent this var becoming reusable. // Set Gradients as Persistable to prevent this var becoming reusable.
for (auto &grad_var_name : grads) { for (auto &grad_var_name : grads) {
auto iter = vars.find(grad_var_name); auto iter = vars_info.find(grad_var_name);
PADDLE_ENFORCE(iter != vars.end()); PADDLE_ENFORCE(iter != vars_info.end());
PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); PADDLE_ENFORCE(!iter->second.empty());
PADDLE_ENFORCE(iter->second->Var()->GetType() == proto::VarType::LOD_TENSOR, PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
PADDLE_ENFORCE(IsLoDTensorType(iter->second.front()->Var()->GetType()),
"Currently the gradient type only should be LoDTensor when " "Currently the gradient type only should be LoDTensor when "
"fusing optimizer ops."); "fusing optimizer ops.");
iter->second->Var()->SetPersistable(true); for (auto var : iter->second) {
} var->Var()->SetPersistable(true);
// Init Grads
for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
auto &scope = *it;
VLOG(6) << "Init: " << fused_grad_name;
PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
"%s has existed in scope.", fused_grad_name);
scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
for (auto &grad_var_name : grads) {
auto iter = vars.find(grad_var_name);
PADDLE_ENFORCE(iter != vars.end());
PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
scope->Var(grad_var_name)->GetMutable<LoDTensor>();
} }
} }
// Define Ops // Define Ops
ProgramDesc program_desc; result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0); auto *global_block = program_desc.MutableBlock(0);
AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block, AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
false, false); false, false);
// Run Ops }
RunInitOps(places, local_scopes, *global_block);
std::unordered_map<std::string, std::vector<Node *>>
FuseOptimizerOpPass::GetVarInfo(const Graph &result) const {
std::unordered_map<std::string, std::vector<Node *>> vars;
for (Node *node : result.Nodes()) {
if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer;
vars[node->Var()->Name()].emplace_back(node);
}
}
return vars;
}
bool FuseOptimizerOpPass::IsLoDTensorType(
const proto::VarType::Type &type) const {
// Current only support LOD_TENSOR.
return type == proto::VarType::LOD_TENSOR;
}
proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
const std::string &name) const {
auto grad_iter = var_nodes.find(name);
PADDLE_ENFORCE(grad_iter != var_nodes.end());
PADDLE_ENFORCE(grad_iter->second.size() > 0);
PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
return grad_iter->second.front()->Var()->GetType();
} }
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names, const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>> const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set, &aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const { const std::unordered_map<std::string, std::string> &fused_vars_name,
// Init Vars ir::Graph *result) const {
for (auto &var_name : aux_var_names) {
auto &fused_var_name = fused_vars_name.at(var_name);
InitVars(local_scopes, fused_var_name);
}
// Define Ops // Define Ops
ProgramDesc program_desc; result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0); auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) { for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace( AppendAllocContinuousSpace(
aux_var_set.at(var_name), aux_var_set.at(var_name), aux_var_set.at(var_name), aux_var_set.at(var_name),
fused_vars_name.at(var_name), global_block, true); fused_vars_name.at(var_name), global_block, true);
} }
// Run Ops
RunInitOps(places, local_scopes, *global_block);
}
void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const BlockDesc &global_block) const {
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : global_block.AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]);
}
}
}
void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
const std::string &fused_var_name) const {
// Alloc parameters and auxiliary vars in the respective scope.
size_t idx = local_scopes.size();
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
++iter, --idx) {
auto &scope = *iter;
VLOG(6) << "Init: " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has exist in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
} }
void FuseOptimizerOpPass::SortParametersAndAuxVars( void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads, const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set, std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
std::vector<ir::Node *> *ops) const { std::vector<ir::Node *> *ops) const {
PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0)); PADDLE_ENFORCE_NE(aux_vars_set->count(kParam), static_cast<size_t>(0));
auto &param_vec = aux_vars_set->at("Param"); auto &param_vec = aux_vars_set->at(kParam);
std::vector<size_t> param_sort_idx; std::vector<size_t> param_sort_idx;
param_sort_idx.reserve(param_vec.size()); param_sort_idx.reserve(param_vec.size());
...@@ -264,11 +292,13 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( ...@@ -264,11 +292,13 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
} }
std::swap(aux_vars.second, sorted_vars); std::swap(aux_vars.second, sorted_vars);
std::stringstream out; if (VLOG_IS_ON(6)) {
for (auto &var_name : aux_vars.second) { std::stringstream out;
out << var_name << " "; for (auto &var_name : aux_vars.second) {
out << var_name << " ";
}
VLOG(6) << aux_vars.first << ": " << out.str();
} }
VLOG(6) << aux_vars.first << ": " << out.str();
} }
std::vector<ir::Node *> sorted_ops; std::vector<ir::Node *> sorted_ops;
...@@ -280,21 +310,19 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( ...@@ -280,21 +310,19 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
} }
void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops, const std::vector<ir::Node *> &opt_nodes,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name) std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const { const {
if (node->Op()->Type() != op_type) return; for (auto &node : opt_nodes) {
std::stringstream out;
std::stringstream out; for (auto &var_n : aux_vars_name) {
for (auto &var_n : aux_vars_name) { auto arg_names = node->Op()->Input(var_n);
auto arg_names = node->Op()->Input(var_n); PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1)); (*aux_args_name)[var_n].emplace_back(arg_names[0]);
(*aux_args_name)[var_n].emplace_back(arg_names[0]); out << var_n << ", " << arg_names[0] << "; ";
out << var_n << ", " << arg_names[0] << "; "; }
} }
VLOG(7) << out.str();
ops->emplace_back(node);
} }
void FuseOptimizerOpPass::AppendAllocContinuousSpace(
...@@ -302,7 +330,7 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace(
    const std::vector<std::string> &out_args, const std::string &fused_out_arg,
    BlockDesc *global_block, bool copy_data, bool check_name) const {
  auto op_desc = global_block->AppendOp();
-  op_desc->SetType("alloc_continuous_space");
  op_desc->SetType("coalesce_tensor");
  op_desc->SetInput("Input", in_args);
  op_desc->SetOutput("Output", out_args);
  op_desc->SetOutput("FusedOutput", {fused_out_arg});
...@@ -311,10 +339,10 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace(
}
void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
-    const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const {
    const std::vector<ir::Node *> &opt_nodes, ir::Node *opt_node) const {
  std::unordered_set<ir::Node *> inputs;
  std::unordered_set<ir::Node *> outputs;
-  for (auto opt_op : opt_ops) {
  for (auto opt_op : opt_nodes) {
    // set inputs
    inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
    for (auto &input : opt_op->inputs) {
......
...@@ -55,8 +55,8 @@ class FuseOptimizerOpPass : public ir::Pass {
      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
  void GetSpecifiedOpsAndVars(
-      const std::string &op_type, const std::vector<std::string> &aux_vars_name,
-      ir::Node *node, std::vector<ir::Node *> *ops,
      const std::vector<std::string> &aux_vars_name,
      const std::vector<ir::Node *> &opt_nodes,
      std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
      const;
...@@ -67,27 +67,30 @@ class FuseOptimizerOpPass : public ir::Pass {
                                  bool check_name = true) const;
  void InitFusedGradsAndAllocSpaceForGrads(
-      const std::vector<platform::Place> &places,
-      const std::vector<Scope *> &local_scopes,
      const std::vector<std::string> &params,
      const std::vector<std::string> &grads, const std::string &fused_grad_name,
      ir::Graph *result) const;
  void InitFusedVarsAndAllocSpaceForVars(
-      const std::vector<platform::Place> &places,
-      const std::vector<Scope *> &local_scopes,
      const std::vector<std::string> &aux_var_names,
      const std::unordered_map<std::string, std::vector<std::string>>
          &aux_var_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name)
-      const;
      const std::unordered_map<std::string, std::string> &fused_vars_name,
      ir::Graph *result) const;
  std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
      const Graph &result) const;
  proto::VarType::Type GetTypeOfVar(
      const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
      const std::string &name) const;
-  void RunInitOps(const std::vector<platform::Place> &places,
-                  const std::vector<Scope *> &local_scopes,
-                  const BlockDesc &global_block) const;
  void GradientsFilter(const std::vector<size_t> &new_grad_idx,
                       std::vector<Node *> *opt_nodes,
                       std::unordered_map<std::string, std::vector<std::string>>
                           *aux_var_set) const;
-  void InitVars(const std::vector<Scope *> &local_scopes,
-                const std::string &fused_var_name) const;
  bool IsLoDTensorType(const proto::VarType::Type &type) const;
};
}  // namespace ir
......
...@@ -108,8 +108,6 @@ bool VarDescIsConsistency(const Graph &graph) {
      var_name2node_set;
  for (ir::Node *node : graph.Nodes()) {
    if (node->IsVar() && node->Var()) {
-      // Note: The graph may have the same name node. For example, parameter
-      // is the input of operator and it also is the output of optimizer;
      var_name2node_set[node->Var()->Name()].emplace(node);
    }
  }
......
...@@ -38,6 +38,10 @@ struct NodeComp {
bool HasCircle(const Graph &graph);
// Check if the var desc of node is consistency.
// The graph may have the same name node, for example, parameter
// is the input of operator and it also is the output of optimizer.
// For the persistable variable, the var_desc of the nodes with
// the same node name should be equal.
bool VarDescIsConsistency(const Graph &graph);
// Find All Circles for debugging,
......
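The invariant documented in the new graph_helper.h comment can be pictured with a small stand-alone sketch (hypothetical names, not the Paddle implementation): group variable nodes by name, and require that every persistable variable with the same name carries one identical description.

from collections import defaultdict

def var_desc_is_consistent(nodes):
    # Toy model of the check: `nodes` is a list of (name, persistable, desc)
    # tuples standing in for ir::Node variables; descs are plain dicts here.
    by_name = defaultdict(list)
    for name, persistable, desc in nodes:
        by_name[name].append((persistable, desc))
    for name, group in by_name.items():
        if any(persistable for persistable, _ in group):
            first = group[0][1]
            if any(desc != first for _, desc in group[1:]):
                return False  # same persistable var, diverging descriptions
    return True

# A parameter may appear twice (input of an operator and output of the
# optimizer); as long as both nodes share one description, the graph passes.
print(var_desc_is_consistent([
    ("fc_0.w_0", True, {"shape": [784, 10], "dtype": "float32"}),
    ("fc_0.w_0", True, {"shape": [784, 10], "dtype": "float32"}),
]))  # True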
...@@ -38,7 +38,7 @@ class FuseAllReduceOpPass : public ir::Pass {
#endif
    auto &params_grads =
-        result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
        result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
    size_t num_of_all_reduce = params_grads.size();
    std::unordered_set<std::string> grads;
    grads.reserve(num_of_all_reduce);
...@@ -60,8 +60,8 @@ class FuseAllReduceOpPass : public ir::Pass {
                   "it is not supported currently.");
    VLOG(10) << "Insert fused_all_reduce";
-    auto &group_params_grads =
-        graph->Get<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads);
    auto &group_params_grads = graph->Get<details::GroupParamsAndGrads>(
        details::kGroupParamsAndDenseGrads);
    for (auto &group_p_g : group_params_grads) {
      size_t group_size = group_p_g.size();
......
...@@ -49,7 +49,7 @@ class Node {
 public:
  virtual ~Node() {
    if (!wrapper_.empty()) {
-      VLOG(4) << "ir::Node deleting a wrapper node " << Name();
      VLOG(10) << "ir::Node deleting a wrapper node " << Name();
      wrapper_deleter_();
    }
  }
......
...@@ -33,16 +33,12 @@ Graph* Pass::Apply(Graph* graph) const {
    PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
                   attr);
  }
-  auto* native_graph = graph;
  ApplyImpl(graph);
  // TODO(panyx0718): Add more verifications.
  PADDLE_ENFORCE(!HasCircle(*graph),
                 "Illegal Pass. Generated graph shouldn't has cycle.");
  PADDLE_ENFORCE(VarDescIsConsistency(*graph),
                 "The VarDescs of persistable variable are not consistency.");
-  PADDLE_ENFORCE(graph == native_graph,
-                 "Pass::Apply() cannot delete the passed graph and shouldn't "
-                 "return a new graph.(For the need of pybind11)");
  applied_ = true;
  return graph;
}
......
...@@ -88,6 +88,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code
if (WITH_GPU)
    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
# FIXME(typhoonzero): operator deps may not needed.
# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
......
...@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_memory_aligment.h"
namespace paddle {
namespace operators {
...@@ -26,7 +27,7 @@ static framework::proto::VarType::Type kDefaultDtype =
    framework::proto::VarType::Type::VarType_Type_BOOL;
template <typename DeviceContext, typename T>
-class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
class CoalesceTensorOp : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto &in_var_names = context.Inputs("Input");
...@@ -86,8 +87,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
        framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
                              &sub_tensor);
-        offset +=
-            Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
        offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) /
                  size_of_dtype;
      }
    } else if (context.Attr<bool>("set_constant")) {
      math::SetConstant<DeviceContext, T> set_constant;
...@@ -106,7 +107,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
          ->ShareDataWith(fused_tensor->Slice(
              static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
          .Resize(dim);
-      len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
      len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
            size_of_dtype;
      offset += len;
      ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
         << " address: " << out_tensors[i]->data<void>() << ", ";
...@@ -115,19 +117,6 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
  }
 private:
-  // Note(zcd): Addresses should be aligned, otherwise, the results may have
-  // diff.
-  size_t Alignment(size_t size, const platform::Place &place) const {
-    // Allow to allocate the minimum chunk size is 4 KB.
-    size_t alignment = 1 << 12;
-    if (platform::is_gpu_place(place)) {
-      // Allow to allocate the minimum chunk size is 256 B.
-      alignment = 1 << 8;
-    }
-    size_t remaining = size % alignment;
-    return remaining == 0 ? size : size + (alignment - remaining);
-  }
  void GetMemSizeAndDtype(
      const std::vector<const framework::LoDTensor *> &lod_tensors,
      const std::vector<std::string> var_names, size_t *numel,
...@@ -156,7 +145,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_GT(size, 0);
      ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
         << "), ";
-      *numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
      *numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
                                    place) /
                size_of_dtype;
    }
...@@ -176,17 +166,17 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("Input",
             "(vector<LoDTensor>) The input tensors of"
-             " alloc_continuous_space operator.")
             " coalesce_tensor operator.")
        .AsDuplicable();
    AddOutput("Output",
              "(vector<LoDTensor>) The output "
-              "tensors of alloc_continuous_space operator. And the address "
              "tensors of coalesce_tensor operator. And the address "
              "of output tensors are continuous, they are sliced from the "
              "tensor of FusedOutput.")
        .AsDuplicable();
    AddOutput("FusedOutput",
              "(LoDTensor) The output tensor "
-              "of alloc_continuous_space operator. And the tensors of"
              "of coalesce_tensor operator. And the tensors of"
              " Output is sliced from the tensor of FusedOutput.");
    AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
        .SetDefault(false);
...@@ -204,7 +194,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
AllocContinuousSpace Operator.
-alloc_continuous_space is used to make the address of Output
coalesce_tensor is used to make the address of Output
continuous according to the Input. This Op will alloc a big tensor
according to the tensors of Input, the dtype is the same with those input tensors,
the size is the sum of those input tensors' numel, and the dim of the big
...@@ -213,7 +203,7 @@ The tensors of Output are sliced from the tensor of FusedOutput.
Note that, the dtype of Input should be the same, and the dim of Input
and Output should equal.
The tensors of Input and Output could be the same or different. And
-alloc_continuous_space allows copying the value of Input to Output, or
coalesce_tensor allows copying the value of Input to Output, or
setting the Output with a constant value.
)DOC");
...@@ -223,27 +213,22 @@ setting the Output with a constant value.
}  // namespace operators
}  // namespace paddle
-REGISTER_OPERATOR(alloc_continuous_space,
-                  paddle::operators::AllocContinuousSpaceOp,
REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp,
                  paddle::operators::AllocContinuousSpaceOpMaker);
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CPU_KERNEL(
-    alloc_continuous_space,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
-                                    plat::float16>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
-                                    double>);
    coalesce_tensor,
    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, plat::float16>,
    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, int>,
    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, float>,
    ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, double>);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL(
-    alloc_continuous_space,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
-                                    plat::float16>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
-                                    double>);
    coalesce_tensor,
    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, plat::float16>,
    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, int>,
    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, float>,
    ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, double>);
#endif
...@@ -102,17 +102,17 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU)
    nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
    nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
    nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
    cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
    cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
-IF(WITH_GPU)
-    nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
-ENDIF()
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
if(WITH_GPU)
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_memory_aligment.h"
namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place) {
size_t alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
} else {
#ifdef PADDLE_WITH_CUDA
alignment = GpuMinChunkSize();
#else
PADDLE_THROW("Fluid is not compiled with CUDA");
#endif
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place);
} // namespace platform
} // namespace paddle
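The new platform::Alignment helper simply rounds a byte count up to the platform's minimum chunk size. The same arithmetic in a few lines of Python — the 4 KB and 256 B figures below are only examples, echoing the chunk sizes mentioned in the kernel-local helper this change removes; the real values come from CpuMinChunkSize()/GpuMinChunkSize() and may differ by build:

def align_up(size, alignment):
    # Round `size` up to the next multiple of `alignment`, as platform::Alignment does.
    remaining = size % alignment
    return size if remaining == 0 else size + (alignment - remaining)

print(align_up(1000, 4096))  # 4096 -- e.g. a 4 KB CPU minimum chunk
print(align_up(1000, 256))   # 1024 -- e.g. a 256 B GPU minimum chunk
print(align_up(8192, 4096))  # 8192 -- already aligned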
...@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
......
...@@ -38,6 +38,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                  batch_size=None,
                                  allow_op_delay=False,
                                  feed_dict=None,
                                  get_data_from_feeder=None,
                                  seed=None,
                                  use_parallel_executor=True,
                                  use_reduce=False,
...@@ -74,6 +75,10 @@ class TestParallelExecutorBase(unittest.TestCase):
        if memory_opt:
            fluid.memory_optimize(main)

        if get_data_from_feeder is not None:
            assert feed_dict is None
            feed_dict = get_data_from_feeder()

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)
...@@ -81,6 +86,7 @@ class TestParallelExecutorBase(unittest.TestCase):
        exec_strategy.allow_op_delay = allow_op_delay
        if use_fast_executor:
            exec_strategy.use_experimental_executor = True

        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
......
...@@ -55,6 +55,34 @@ def fc_with_batchnorm(use_feed=None):
    return loss


def bow_net(use_feed,
            dict_dim,
            is_sparse=False,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """
    BOW net
    This model is from https://github.com/PaddlePaddle/models:
    fluid/PaddleNLP/text_classification/nets.py
    """
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    emb = fluid.layers.embedding(
        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    return avg_cost


def init_data(batch_size=32, img_shape=[784], label_range=9):
    np.random.seed(5)
    assert isinstance(img_shape, list)
......
...@@ -24,7 +24,7 @@ alignment = 256
class TestAllocContinuousSpace(OpTest):
    def setUp(self):
-        self.op_type = "alloc_continuous_space"
        self.op_type = "coalesce_tensor"
        self.dtype = np.float32
        attrs = self.init_attr()
        self.copy_data = attrs["copy_data"]
...@@ -64,14 +64,13 @@ class TestAllocContinuousSpace(OpTest):
            out[0:length] = input[1].flatten()
            inputs.append(out)

-        alloc_continuous_space_var = np.concatenate([input for input in inputs])
        coalesce_tensor_var = np.concatenate([input for input in inputs])
        if set_constant:
-            alloc_continuous_space_var = np.ones(
-                (len(alloc_continuous_space_var))) * constant
            coalesce_tensor_var = np.ones((len(coalesce_tensor_var))) * constant
            outputs = [(out[0],
                        np.ones(out[1].shape).astype(self.dtype) * constant)
                       for out in outputs]
-        return outputs, alloc_continuous_space_var
        return outputs, coalesce_tensor_var

    def test_check_output(self):
        if core.is_compiled_with_cuda():
......
...@@ -11,71 +11,122 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

-from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
from fake_reader import fake_imdb_reader
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
-import numpy as np
from functools import partial
import paddle
-import paddle.dataset.mnist as mnist
import unittest
import os


-class TestMNIST(TestParallelExecutorBase):
class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
    @classmethod
    def setUpClass(cls):
        os.environ['CPU_NUM'] = str(4)

-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
-    def _compare_fuse_all_reduce_ops(self, model, use_cuda):
    def compare_fuse_all_reduce_ops(self,
                                    model,
                                    use_cuda,
                                    init_feed_dicta=None,
                                    get_data_from_feeder=None,
                                    optimizer=None,
                                    fuse_all_optimizer_ops=False):
        if use_cuda and not core.is_compiled_with_cuda():
            return
-        img, label = init_data()
-
-        def _optimizer(learning_rate=1e-6):
-            optimizer = fluid.optimizer.SGD(
-                learning_rate=learning_rate,
-                regularization=fluid.regularizer.L2Decay(1e-6))
-            return optimizer

        feed_dict_data = None
        if init_feed_dicta is not None:
            img, label = init_feed_dicta()
            feed_dict_data = {"image": img, "label": label}

        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
            model,
-            feed_dict={"image": img,
-                       "label": label},
            feed_dict=feed_dict_data,
            get_data_from_feeder=get_data_from_feeder,
            use_cuda=use_cuda,
            fuse_all_reduce_ops=False,
            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
            memory_opt=False,
-            optimizer=_optimizer)
            optimizer=optimizer)
        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
            model,
-            feed_dict={"image": img,
-                       "label": label},
            feed_dict=feed_dict_data,
            get_data_from_feeder=get_data_from_feeder,
            use_cuda=use_cuda,
            fuse_all_reduce_ops=True,
            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
            memory_opt=False,
-            optimizer=_optimizer)
            optimizer=optimizer)

        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_all_reduce_ops(simple_fc_net, True)
-        self._compare_fuse_all_reduce_ops(simple_fc_net, False)
    def optimizer(self, learning_rate=1e-3):
        optimizer = fluid.optimizer.SGD(
            learning_rate=learning_rate,
            regularization=fluid.regularizer.L2Decay(1e-3))
        return optimizer


class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
        self.compare_fuse_all_reduce_ops(
            model,
            use_cuda,
            init_feed_dicta=init_data,
            optimizer=self.optimizer,
            fuse_all_optimizer_ops=True)

    def test_simple_fc_with_fuse_all_reduce(self):
        self._decorate_compare_fused_all_reduce(simple_fc_net, True)
        self._decorate_compare_fused_all_reduce(simple_fc_net, False)

    def test_batchnorm_fc_with_fuse_all_reduce(self):
        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, True)
        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, False)


class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
        self.compare_fuse_all_reduce_ops(
            model,
            use_cuda,
            init_feed_dicta=init_data,
            optimizer=self.optimizer,
            fuse_all_optimizer_ops=True)


class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
    @classmethod
    def setUpClass(cls):
        os.environ['CPU_NUM'] = str(4)
        cls.word_dict_len = 5147
        batch_size = 64
        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
        reader = paddle.batch(reader, batch_size=batch_size)()
        cls.train_data = next(reader)

    def get_data_from_feeder(self):
        place = fluid.CPUPlace()
        feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
        return feeder.feed(self.train_data)

    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
        self.compare_fuse_all_reduce_ops(
            model,
            use_cuda,
            get_data_from_feeder=self.get_data_from_feeder,
            optimizer=self.optimizer)

-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fuse_all_reduce_ops(fc_with_batchnorm, True)
-        self._compare_fuse_all_reduce_ops(fc_with_batchnorm, False)
    def test_simple_bow_net_with_fuse_all_reduce(self):
        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
        self._decorate_compare_fused_all_reduce(model, True)
        self._decorate_compare_fused_all_reduce(model, False)


if __name__ == '__main__':
......
...@@ -11,30 +11,39 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

-from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
from fake_reader import fake_imdb_reader
from parallel_executor_test_base import TestParallelExecutorBase
from functools import partial
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import unittest
import os


-class TestFuseAdamOps(TestParallelExecutorBase):
class TestFuseOptimizationOps(TestParallelExecutorBase):
    @classmethod
    def setUpClass(cls):
        os.environ['CPU_NUM'] = str(4)

    def _get_feed_dict(self):
        img, label = init_data()
        return {"image": img, "label": label}

    def _compare_fused_optimizer_ops(self,
                                     model,
                                     use_cuda,
                                     feed_dict=None,
                                     get_data_from_feeder=None,
                                     optimizer=fluid.optimizer.Adam):
        if use_cuda and not core.is_compiled_with_cuda():
            return
-        img, label = init_data()
-        feed_dict = {"image": img, "label": label}

        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
            model,
            feed_dict=feed_dict,
            get_data_from_feeder=get_data_from_feeder,
            use_cuda=use_cuda,
            fuse_all_optimizer_ops=False,
            memory_opt=False,  # avoid the gradient's name changed in Python side.
...@@ -42,6 +51,7 @@ class TestFuseAdamOps(TestParallelExecutorBase):
        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
            model,
            feed_dict=feed_dict,
            get_data_from_feeder=get_data_from_feeder,
            use_cuda=use_cuda,
            fuse_all_optimizer_ops=True,
            memory_opt=False,  # avoid the gradient's name changed in Python side.
...@@ -52,48 +62,84 @@ class TestFuseAdamOps(TestParallelExecutorBase):
        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer):
        self._compare_fused_optimizer_ops(
            model,
            use_cuda,
            feed_dict=self._get_feed_dict(),
            optimizer=optimizer)


class TestFuseAdamOps(TestFuseOptimizationOps):
    def optimizer(self, learning_rate=1e-4):
        return fluid.optimizer.Adam(learning_rate=learning_rate)

    def test_simple_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(simple_fc_net, True)
-        self._compare_fused_optimizer_ops(simple_fc_net, False)
        self._decorate_compare_fused_optimizer_ops(
            simple_fc_net, True, optimizer=self.optimizer)
        self._decorate_compare_fused_optimizer_ops(
            simple_fc_net, False, optimizer=self.optimizer)

    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(fc_with_batchnorm, True)
-        self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
        self._decorate_compare_fused_optimizer_ops(
            fc_with_batchnorm, True, optimizer=self.optimizer)
        self._decorate_compare_fused_optimizer_ops(
            fc_with_batchnorm, False, optimizer=self.optimizer)


class TestFuseSGDOps(TestFuseAdamOps):
-    def sgd_optimizer(self, learning_rate=1e-3):
    def optimizer(self, learning_rate=1e-3):
        return fluid.optimizer.SGD(learning_rate=learning_rate)

-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, True, optimizer=self.sgd_optimizer)
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, False, optimizer=self.sgd_optimizer)
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.sgd_optimizer)
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.sgd_optimizer)


class TestFuseMomentumOps(TestFuseAdamOps):
-    def momentum_optimizer(self, learning_rate=1e-3):
    def optimizer(self, learning_rate=1e-3):
        return fluid.optimizer.Momentum(
            learning_rate=learning_rate, momentum=0.1)

-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, True, optimizer=self.momentum_optimizer)
-        self._compare_fused_optimizer_ops(
-            simple_fc_net, False, optimizer=self.momentum_optimizer)
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.momentum_optimizer)
-        self._compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.momentum_optimizer)


class TestSpareFuseAdamOps(TestFuseOptimizationOps):
    @classmethod
    def setUpClass(cls):
        os.environ['CPU_NUM'] = str(4)
        cls.word_dict_len = 5147
        batch_size = 64
        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
        reader = paddle.batch(reader, batch_size=batch_size)()
        cls.train_data = next(reader)

    def _get_data_from_feeder(self):
        place = fluid.CPUPlace()
        feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
        return feeder.feed(self.train_data)

    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer):
        self._compare_fused_optimizer_ops(
            model,
            use_cuda,
            get_data_from_feeder=self._get_data_from_feeder,
            optimizer=optimizer)

    def optimizer(self, learning_rate=1e-4):
        return fluid.optimizer.Adam(learning_rate=learning_rate)

    def test_simple_bow_net_with_fuse_op(self):
        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
        self._decorate_compare_fused_optimizer_ops(
            model, True, optimizer=self.optimizer)
        self._decorate_compare_fused_optimizer_ops(
            model, False, optimizer=self.optimizer)


class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
    def optimizer(self, learning_rate=1e-3):
        return fluid.optimizer.SGD(learning_rate=learning_rate)


class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
    def optimizer(self, learning_rate=1e-3):
        return fluid.optimizer.Momentum(
            learning_rate=learning_rate, momentum=0.1)


if __name__ == '__main__':
......