From 4f8594088d6d7ce0f20b45d2bcd4d0b9e74e8480 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 7 May 2019 07:21:40 -0500 Subject: [PATCH] Enhance inplace/mem-opt pass and enhance softmax_with_cross_entropy op inplace (#17225) * add use_cuda to inplace pass,test=develop * add test softmax_with_xe_inplace test,test=develop * fix potential inplace bug test=develop * add more skip vars in mem opt pass,test=develop * follow comment,test=develop * follow comments,move duplicate out arg check to program->graph,test=develop --- .../framework/details/inplace_op_pass.cc | 58 ++++---- .../framework/details/memory_optimize_pass.cc | 24 +-- .../record_skip_memory_opt_vars_pass.cc | 138 ++++++++++++++++-- paddle/fluid/framework/ir/graph.cc | 14 ++ .../softmax_with_cross_entropy_op.cc | 6 +- ...test_inplace_softmax_with_cross_entropy.py | 73 +++++++-- 6 files changed, 226 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index 9313d9958dd..5ed971eecd4 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -111,10 +111,14 @@ class InplacePass : public ir::Pass { // Check whether all `ops` is the preceding ops of `op` bool CheckOpDeps(ir::Node *op, const std::vector &ops) const; - // Find nodes whose name are equal to the given name + // Find nodes whose names are equal to the given name static std::unordered_set FindNodesByName( const std::string &name, const std::vector &nodes); + // Collect inputs and outputs of op_desc + static void CollectInputArgsOfOpDesc( + const OpDesc *op_desc, std::unordered_multiset *in_args); + // Get all versions vars named var_name std::vector *AllVersionVars(const std::string &var_name) const; @@ -201,37 +205,6 @@ void InplacePass::CollectSkipVars(ir::Graph *graph, for (const auto &var : mem_opt_whitelist) { skip_vars_.emplace(var); } - - // 2. track the nodes which used by parameter server. - // these node can not be inplaced, otherwise trainer - // pserver can not find each other's name. 
- // Also check the ops which has sub-block - auto update_skip_set = [&](ir::Node *node) { - for (auto &in : node->inputs) { - if (in->IsVar() && in->Var() != nullptr) { - skip_vars_.emplace(in->Name()); - } - } - for (auto &out : node->outputs) { - if (out->IsVar() && out->Var() != nullptr) { - skip_vars_.emplace(out->Name()); - } - } - }; - - for (auto *node : ops) { - if (!node->IsOp()) continue; - // avoid optimizing the variable used in sub-blocks - if (OpHasSubBlock(node->Op())) { - update_skip_set(node); - continue; - } - - auto node_name = node->Name(); - if (node_name == "send" || node_name == "recv" || node_name == "prefetch") { - update_skip_set(node); - } - } } void InplacePass::RenameInOut(ir::Node *op, ir::Node *in_var, @@ -301,6 +274,14 @@ std::unordered_set InplacePass::FindNodesByName( return ret; } +void InplacePass::CollectInputArgsOfOpDesc( + const OpDesc *op_desc, std::unordered_multiset *in_args) { + in_args->clear(); + for (auto &in_name : op_desc->InputArgumentNames()) { + in_args->insert(in_name); + } +} + void InplacePass::ApplyImpl(ir::Graph *graph) const { // Step 1: topo sort ops, collect skip vars auto ops = ir::TopologySortOperations(*graph); @@ -346,6 +327,11 @@ void InplacePass::ApplyImpl(ir::Graph *graph) const { } auto in_to_outs = infer_inplace(*op_desc, use_cuda); + if (in_to_outs.empty()) continue; + + std::unordered_multiset all_in_args; + CollectInputArgsOfOpDesc(op_desc, &all_in_args); + for (auto &pair : in_to_outs) { auto &in_param = pair.first; auto &out_param = pair.second; @@ -387,6 +373,14 @@ void InplacePass::ApplyImpl(ir::Graph *graph) const { continue; } + size_t in_arg_occur_times = all_in_args.count(in_arg); + if (in_arg_occur_times > 1) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " occurs " << in_arg_occur_times << " times in input of op " + << op_type; + continue; + } + auto in_nodes = FindNodesByName(in_arg, op_node->inputs); PADDLE_ENFORCE(!in_nodes.empty(), "Input(%s)=%s cannot be found in op %s", in_param, in_arg, op_type); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index ef36f1038e2..56e5c072b46 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -207,28 +207,8 @@ void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const { // fill skip_set_ PADDLE_ENFORCE(graph->Has(details::kMemOptSkipVars)); auto& mem_opt_whitelist = graph->Get(kMemOptSkipVars); - for (const auto& var : mem_opt_whitelist) skip_set_.emplace(var); - - auto update_skip_set = [&](OpDesc* op_desc) { - auto inputs = op_desc->InputArgumentNames(); - auto outputs = op_desc->OutputArgumentNames(); - skip_set_.insert(inputs.begin(), inputs.end()); - skip_set_.insert(outputs.begin(), outputs.end()); - }; - - auto nodes = graph->Nodes(); - for (auto& op : nodes) { - if (!op->IsOp() || op->Op() == nullptr) continue; - auto* op_desc = op->Op(); - // NOTE(dzhwinter): - // current block can not reuse next level block vars. 
- if (OpHasSubBlock(op_desc)) update_skip_set(op_desc); - // NOTE(dzhwinter): - // distributed ops input/output name need to - // keep same bettwen trainer/pserver - if (op_desc->Type() == "send") update_skip_set(op_desc); - if (op_desc->Type() == "recv") update_skip_set(op_desc); - if (op_desc->Type() == "prefetch") update_skip_set(op_desc); + for (const auto& var : mem_opt_whitelist) { + skip_set_.emplace(var); } } diff --git a/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc b/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc index 7cb2544ebbf..d28c5afcf30 100644 --- a/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc +++ b/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc @@ -13,11 +13,14 @@ // limitations under the License. #include +#include +#include #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace framework { @@ -30,26 +33,129 @@ class RecordSkipMemoryOptVarsPass : public ir::Pass { graph->Set(kMemOptSkipVars, new MemOptSkipVars); auto& skip_vars = graph->Get(kMemOptSkipVars); + std::vector op_nodes; + for (auto& node : graph->Nodes()) { + PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr."); + if (node->IsOp() && node->Op()) { + op_nodes.emplace_back(node); + } + } + + // Insert kEmptyVarName to avoid optimizing empty variable + skip_vars.insert(framework::kEmptyVarName); + // NOTE(zcd): Insert OpRoleVars to SkipVarSet to prevent the vars are rename // in memory optimize pass. - InsertOpRoleVarsToSkipVarSet(graph, &skip_vars); + InsertOpRoleVarsToSkipVarSet(op_nodes, &skip_vars); + + InsertSkipMemOptOpInOutToSkipVarSet(op_nodes, &skip_vars); } - void InsertOpRoleVarsToSkipVarSet(const ir::Graph* graph, - MemOptSkipVars* skip_vars) const { - for (auto& node : graph->Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr."); - if (node->IsOp() && node->Op()) { - try { - auto op_role_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0); - for (size_t i = 0; i < op_role_vars.size(); i += 2) { - auto& g_name = op_role_vars[i + 1]; - skip_vars->insert(g_name); - } - } catch (boost::bad_get e) { + private: + static void InsertOpRoleVarsToSkipVarSet(const std::vector& ops, + MemOptSkipVars* skip_vars) { + for (auto& node : ops) { + try { + auto op_role_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0); + for (size_t i = 0; i < op_role_vars.size(); i += 2) { + auto& g_name = op_role_vars[i + 1]; + skip_vars->insert(g_name); + } + } catch (boost::bad_get& e) { + } + } + } + + static void UpdateSkipVarSet( + MemOptSkipVars* skip_vars, + const std::vector>& var_names) { + for (auto& var_name : var_names) { + skip_vars->insert(var_name.begin(), var_name.end()); + } + } + + static std::vector ToGradVarName( + const std::vector& names) { + std::vector ret; + ret.reserve(names.size()); + for (auto& name : names) { + if (name != framework::kEmptyVarName) { + ret.emplace_back(framework::GradVarName(name)); + } + } + return ret; + } + + static void InsertSkipMemOptOpInOutToSkipVarSet( + const std::vector& ops, MemOptSkipVars* 
skip_vars) { + static std::unordered_set kSkipMemOptOps{ + "send", "recv", "prefetch", "send_barrier", "fetch_barrier"}; + + for (auto& node : ops) { + auto* op_desc = node->Op(); + // Some ops (while, conditional_block, recurrent, etc.) have sub-blocks. + // These ops often use variables from its parent or forward blocks. + // Optimizing in/out of such ops would make these variables cannot + // be found when running sub-block ops. + if (OpHasSubBlock(op_desc)) { + UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), + op_desc->OutputArgumentNames()}); + } + + // Skip ops that are related to parameter server. + // In distributed mode, trainers and parameter server use same + // variable names to track same variables. We cannot change the + // names of these variables, otherwise trainers or parameter + // server would not find them. + if (kSkipMemOptOps.count(op_desc->Type()) > 0) { + UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), + op_desc->OutputArgumentNames()}); + } + + // FIXME(zjl): some ops use variables that are not from their + // inputs or outputs. We do not have a nice method to solve this + // issue yet. Currently, we should skip these variables when + // memory optimization is enabled. + auto op_type = op_desc->Type(); + if (op_type == "while_grad") { + // In while_grad, framework::GradVarName(Input("X")) is visited + // without being any in/out of while_grad. While_grad uses + // these variable to accumulate gradient of X across time steps. + UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("X"))}); + } else if (op_type == "conditional_block_grad") { + // In conditional_block_grad, framework::GradVarName(Input("Input", + // "Cond")) is visited without being any in/out of + // conditional_block_grad. Conditional_block_grad uses these + // variables to accumulate gradient of Input/Cond across time steps. + UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("Input")), + ToGradVarName(op_desc->Input("Cond"))}); + } else if (op_type == "recurrent" || op_type == "recurrent_grad") { + // Recurrent and recurrent_grad ops are implemented by a very trickly + // way. Attr("states", "ex_states") is visited without being any + // in/out of op. It is because these variables are from sub blocks, + // not main block. Adding these variables to input would make recurrent + // fail since "states" and "ex_states" cannot be found in main block. + // When memory optimization is enabled, "states", "ex_states" and their + // gradient should be skipped. + auto& ex_states = + boost::get>(op_desc->GetAttr("ex_states")); + auto& states = + boost::get>(op_desc->GetAttr("states")); + if (op_type == "recurrent") { + UpdateSkipVarSet(skip_vars, {ex_states, states}); + } else { + // In recurrent_grad, framework::GradVarName(Input("parameters", + // "input")) is visited without being any in/out of recurrent_grad. + // Recurrent_grad uses these variables to accumulate gradient of + // parameters/input across time steps. + UpdateSkipVarSet( + skip_vars, + {ToGradVarName(op_desc->Input("parameters")), + ToGradVarName(op_desc->Input("input")), ex_states, states, + ToGradVarName(ex_states), ToGradVarName(states)}); } } } diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 6a9340b870d..5eba32c4f3a 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -13,10 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include +#include #include +#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" @@ -61,7 +66,16 @@ std::map> Graph::InitFromProgram( var->outputs.push_back(node); } // For output args, always create a new var. + std::unordered_set out_arg_set; for (auto &each_var_name : op->OutputArgumentNames()) { + if (each_var_name != kEmptyVarName) { + PADDLE_ENFORCE(out_arg_set.count(each_var_name) == 0, + "Program is wrong. %s occurs in output of %s several " + "times.", + each_var_name, op->Type()); + out_arg_set.insert(each_var_name); + } + ir::Node *var = nullptr; if (all_vars.count(each_var_name) != 0) { var = CreateVarNode(all_vars.at(each_var_name)); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 447b5b3a199..456f78d2022 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -261,11 +261,7 @@ class SoftmaxWithCrossEntropyInplaceInference public: std::unordered_map operator()( const framework::OpDesc& op_desc, bool use_cuda) const { - if (use_cuda && !boost::get(op_desc.GetAttr("soft_label"))) { - return {{"Logits", "Softmax"}}; - } else { - return {}; - } + return {{"Logits", "Softmax"}}; } }; diff --git a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py index a19626297a6..d4e514fa24c 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py @@ -21,25 +21,39 @@ import unittest class TestSoftmaxWithXe(unittest.TestCase): def setUp(self): + self.initParameter() self.m, self.n = np.random.random_integers( low=100, high=2000, size=[2]).astype('int64') - def softmax_with_xe(self, x, y, place, inplace=True): + def initParameter(self): + self.dtype = 'float32' + self.soft_label = False + + def softmax_with_xe(self, + x, + y, + place, + inplace=True, + numeric_stable_mode=True): m, n = x.shape with fluid.program_guard(fluid.Program(), fluid.Program()): with fluid.scope_guard(fluid.Scope()): x_d = fluid.layers.data( name='x', shape=[m, n], - dtype='float32', + dtype=self.dtype, append_batch_size=False) y_d = fluid.layers.data( name='y', - shape=[m, 1], - dtype='int64', + shape=[m, 1] if not self.soft_label else [m, n], + dtype='int64' if not self.soft_label else self.dtype, append_batch_size=False) z_d, s_d = fluid.layers.softmax_with_cross_entropy( - x_d, y_d, return_softmax=True) + x_d, + y_d, + soft_label=self.soft_label, + return_softmax=True, + numeric_stable_mode=numeric_stable_mode) exe = fluid.Executor(place) @@ -51,7 +65,7 @@ class TestSoftmaxWithXe(unittest.TestCase): )).with_data_parallel( build_strategy=build_strategy, places=place) - if inplace and isinstance(place, fluid.CUDAPlace): + if inplace: fetch_list = [z_d.name, x_d.name] else: fetch_list = [z_d.name, s_d.name] @@ -63,16 +77,33 @@ class TestSoftmaxWithXe(unittest.TestCase): return z, s def main_with_place(self, place): - x = np.random.random(size=[self.m, self.n]).astype('float32') + x = np.random.random(size=[self.m, self.n]).astype(self.dtype) x_range = [(-30, 30), (10, 20), (-1, 1), (2, 3), (0, 0.3), (-200, -100)] for a, b in 
x_range: - x = ((b - a) * x + a).astype('float32') - y = np.random.random_integers( - size=[self.m, 1], low=0, high=self.n - 1).astype('int64') - z1, s1 = self.softmax_with_xe(x, y, place, False) - z2, s2 = self.softmax_with_xe(x, y, place, True) + x = ((b - a) * x + a).astype(self.dtype) + if not self.soft_label: + y = np.random.random_integers( + size=[self.m, 1], low=0, high=self.n - 1).astype('int64') + else: + y = np.random.random(size=[self.m, self.n]).astype(self.dtype) + norm_y = np.broadcast_to( + np.reshape( + np.sum(y, axis=1), [-1, 1]), y.shape) + y = y / norm_y + + z1, s1 = self.softmax_with_xe( + x, y, place, inplace=False, numeric_stable_mode=False) + z2, s2 = self.softmax_with_xe( + x, y, place, inplace=True, numeric_stable_mode=False) + + self.assertTrue((z1 == z2).all()) + self.assertTrue((s1 == s2).all()) + z1, s1 = self.softmax_with_xe( + x, y, place, inplace=False, numeric_stable_mode=True) + z2, s2 = self.softmax_with_xe( + x, y, place, inplace=True, numeric_stable_mode=True) self.assertTrue((z1 == z2).all()) self.assertTrue((s1 == s2).all()) @@ -82,5 +113,23 @@ class TestSoftmaxWithXe(unittest.TestCase): self.main_with_place(fluid.CUDAPlace(0)) +class TestSoftmaxWithXe1(TestSoftmaxWithXe): + def initParameter(self): + self.dtype = 'float32' + self.soft_label = True + + +class TestSoftmaxWithXe2(TestSoftmaxWithXe): + def initParameter(self): + self.dtype = 'float64' + self.soft_label = False + + +class TestSoftmaxWithXe3(TestSoftmaxWithXe): + def initParameter(self): + self.dtype = 'float64' + self.soft_label = True + + if __name__ == '__main__': unittest.main() -- GitLab
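
The central new guard in InplacePass::ApplyImpl collects every input argument name of an op into a multiset (CollectInputArgsOfOpDesc) and refuses to apply inplace when the candidate input occurs more than once among the op's inputs, since renaming one occurrence would implicitly rename the rest. Below is a minimal standalone sketch of that rule using plain std containers; the ToyOp struct, the CanInplacePair helper, and the example op and argument names are hypothetical stand-ins, not Paddle's actual OpDesc API.

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Hypothetical stand-in for framework::OpDesc: the op type plus the
// flattened lists of input and output argument names.
struct ToyOp {
  std::string type;
  std::vector<std::string> input_args;
  std::vector<std::string> output_args;
};

// Decide whether in_arg may be reused (inplaced) as an output buffer of op.
// Rule mirrored here: an input name that occurs more than once among the
// op's inputs must not be inplaced, because renaming one occurrence would
// also have to rename the others.
bool CanInplacePair(const ToyOp &op, const std::string &in_arg) {
  std::unordered_multiset<std::string> all_in_args(op.input_args.begin(),
                                                   op.input_args.end());
  size_t occur = all_in_args.count(in_arg);
  if (occur > 1) {
    std::cout << "Cannot inplace " << in_arg << ": occurs " << occur
              << " times in inputs of op " << op.type << "\n";
    return false;
  }
  return true;
}

int main() {
  // elementwise_add(X=a, Y=a): "a" feeds two input slots, so it is skipped.
  ToyOp add{"elementwise_add", {"a", "a"}, {"b"}};
  // softmax_with_cross_entropy(Logits=x, Label=y): "x" is unique, so the
  // Logits -> Softmax reuse passes this particular check.
  ToyOp sce{"softmax_with_cross_entropy", {"x", "y"}, {"softmax", "loss"}};

  std::cout << std::boolalpha;
  std::cout << CanInplacePair(add, "a") << "\n";  // false
  std::cout << CanInplacePair(sce, "x") << "\n";  // true
  return 0;
}
```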
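
RecordSkipMemoryOptVarsPass, as extended in this patch, records the variables that memory-optimization passes must not rename: the in/out arguments of ops that own sub-blocks (their sub-block code looks variables up by name in the parent scope), the in/out arguments of distributed ops (send, recv, prefetch, send_barrier, fetch_barrier, which need identical names on trainer and parameter-server sides), and gradient names that ops such as while_grad accumulate without declaring them as inputs or outputs. The sketch below reproduces that collection logic under assumed toy types; ToyOp, CollectSkipVars, and the local GradVarName helper are illustrative, and the while_grad branch is simplified to "all inputs" rather than the real pass's Input("X") slot.

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Hypothetical, simplified op description: type, flattened in/out names,
// and a flag standing in for "this op owns a sub-block" (while, recurrent,
// conditional_block, ...).
struct ToyOp {
  std::string type;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
  bool has_sub_block;
};

// Local stand-in for framework::GradVarName().
std::string GradVarName(const std::string &name) { return name + "@GRAD"; }

// Collect variables that memory optimization must leave untouched.
std::unordered_set<std::string> CollectSkipVars(const std::vector<ToyOp> &ops) {
  static const std::unordered_set<std::string> kSkipMemOptOps{
      "send", "recv", "prefetch", "send_barrier", "fetch_barrier"};

  std::unordered_set<std::string> skip;
  for (const auto &op : ops) {
    // Ops with sub-blocks and distributed ops keep all of their in/out names.
    if (op.has_sub_block || kSkipMemOptOps.count(op.type) > 0) {
      skip.insert(op.inputs.begin(), op.inputs.end());
      skip.insert(op.outputs.begin(), op.outputs.end());
    }
    // while_grad accumulates gradients of its inputs across time steps even
    // though those gradient names are not listed as its inputs or outputs.
    if (op.type == "while_grad") {
      for (const auto &x : op.inputs) skip.insert(GradVarName(x));
    }
  }
  return skip;
}

int main() {
  std::vector<ToyOp> ops = {
      {"while", {"x", "cond"}, {"out"}, /*has_sub_block=*/true},
      {"send", {"w"}, {}, false},
      {"while_grad", {"x"}, {}, false},
      {"relu", {"h"}, {"h2"}, false},  // plain op: contributes nothing
  };
  for (const auto &v : CollectSkipVars(ops)) std::cout << v << "\n";
  return 0;
}
```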
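
Finally, the duplicate-output check moved into Graph::InitFromProgram enforces that a non-empty variable name appears at most once among a single op's outputs, so a malformed program fails at program-to-graph conversion instead of inside each optimization pass. A rough sketch of that validation follows, assuming a toy representation of one op's flattened output names; the kEmptyVarName constant below is a local placeholder for framework::kEmptyVarName, and the error wording simply echoes the patch's message.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

// Placeholder standing in for framework::kEmptyVarName.
const char kEmptyVarName[] = "@EMPTY@";

// A (non-empty) variable name may appear at most once among one op's
// outputs; otherwise the program is treated as malformed.
void CheckNoDuplicateOutputs(const std::string &op_type,
                             const std::vector<std::string> &outputs) {
  std::unordered_set<std::string> seen;
  for (const auto &name : outputs) {
    if (name == kEmptyVarName) continue;
    if (!seen.insert(name).second) {
      throw std::runtime_error("Program is wrong. " + name +
                               " occurs in output of " + op_type +
                               " several times.");
    }
  }
}

int main() {
  // Well-formed op: distinct output names pass silently.
  CheckNoDuplicateOutputs("softmax_with_cross_entropy", {"softmax", "loss"});
  // Malformed op: the same name in two output slots is rejected early.
  try {
    CheckNoDuplicateOutputs("bad_op", {"y", "y"});
  } catch (const std::exception &e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}
```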