Unverified commit 4f859408, authored by Zeng Jinle, committed by GitHub

Enhance inplace/mem-opt pass and enhance softmax_with_cross_entropy op inplace (#17225)

* add use_cuda to inplace pass,test=develop

* add softmax_with_xe_inplace test, test=develop

* fix potential inplace bug
test=develop

* add more skip vars in mem opt pass,test=develop

* follow comment,test=develop

* follow comments,move duplicate out arg check to program->graph,test=develop
Parent 8b62f537
......@@ -111,10 +111,14 @@ class InplacePass : public ir::Pass {
// Check whether all `ops` are the preceding ops of `op`
bool CheckOpDeps(ir::Node *op, const std::vector<ir::Node *> &ops) const;
// Find nodes whose name are equal to the given name
// Find nodes whose names are equal to the given name
static std::unordered_set<ir::Node *> FindNodesByName(
const std::string &name, const std::vector<ir::Node *> &nodes);
// Collect inputs and outputs of op_desc
static void CollectInputArgsOfOpDesc(
const OpDesc *op_desc, std::unordered_multiset<std::string> *in_args);
// Get all versions vars named var_name
std::vector<ir::Node *> *AllVersionVars(const std::string &var_name) const;
......@@ -201,37 +205,6 @@ void InplacePass::CollectSkipVars(ir::Graph *graph,
for (const auto &var : mem_opt_whitelist) {
skip_vars_.emplace(var);
}
// 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer
// pserver can not find each other's name.
// Also check the ops which has sub-block
auto update_skip_set = [&](ir::Node *node) {
for (auto &in : node->inputs) {
if (in->IsVar() && in->Var() != nullptr) {
skip_vars_.emplace(in->Name());
}
}
for (auto &out : node->outputs) {
if (out->IsVar() && out->Var() != nullptr) {
skip_vars_.emplace(out->Name());
}
}
};
for (auto *node : ops) {
if (!node->IsOp()) continue;
// avoid optimizing the variable used in sub-blocks
if (OpHasSubBlock(node->Op())) {
update_skip_set(node);
continue;
}
auto node_name = node->Name();
if (node_name == "send" || node_name == "recv" || node_name == "prefetch") {
update_skip_set(node);
}
}
}
void InplacePass::RenameInOut(ir::Node *op, ir::Node *in_var,
......@@ -301,6 +274,14 @@ std::unordered_set<ir::Node *> InplacePass::FindNodesByName(
return ret;
}
void InplacePass::CollectInputArgsOfOpDesc(
const OpDesc *op_desc, std::unordered_multiset<std::string> *in_args) {
in_args->clear();
for (auto &in_name : op_desc->InputArgumentNames()) {
in_args->insert(in_name);
}
}
void InplacePass::ApplyImpl(ir::Graph *graph) const {
// Step 1: topo sort ops, collect skip vars
auto ops = ir::TopologySortOperations(*graph);
......@@ -346,6 +327,11 @@ void InplacePass::ApplyImpl(ir::Graph *graph) const {
}
auto in_to_outs = infer_inplace(*op_desc, use_cuda);
if (in_to_outs.empty()) continue;
std::unordered_multiset<std::string> all_in_args;
CollectInputArgsOfOpDesc(op_desc, &all_in_args);
for (auto &pair : in_to_outs) {
auto &in_param = pair.first;
auto &out_param = pair.second;
......@@ -387,6 +373,14 @@ void InplacePass::ApplyImpl(ir::Graph *graph) const {
continue;
}
size_t in_arg_occur_times = all_in_args.count(in_arg);
if (in_arg_occur_times > 1) {
VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
<< " occurs " << in_arg_occur_times << " times in input of op "
<< op_type;
continue;
}
auto in_nodes = FindNodesByName(in_arg, op_node->inputs);
PADDLE_ENFORCE(!in_nodes.empty(), "Input(%s)=%s cannot be found in op %s",
in_param, in_arg, op_type);
......
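The new guard above counts how many times an input argument name occurs among the op's inputs; if the same variable feeds more than one input slot, reusing its buffer for an output would clobber data the op still has to read, so the pass skips it. Below is a minimal standalone Python sketch of that multiset check — illustrative names only, not the Paddle C++ implementation:

```python
from collections import Counter

def can_inplace(in_arg, op_input_args):
    """Mirror of the duplicate-input guard: an input variable may only be
    reused in place if it occurs exactly once among the op's input args."""
    occurrences = Counter(op_input_args)[in_arg]
    if occurrences > 1:
        print("Cannot inplace %s: occurs %d times in op inputs"
              % (in_arg, occurrences))
        return False
    return True

# e.g. elementwise_add(X=a, Y=a) feeds one variable into two slots; reusing
# a's buffer for the output would corrupt the second read.
print(can_inplace("a", ["a", "a"]))  # False
print(can_inplace("x", ["x", "y"]))  # True
```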
......@@ -207,28 +207,8 @@ void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const {
// fill skip_set_
PADDLE_ENFORCE(graph->Has(details::kMemOptSkipVars));
auto& mem_opt_whitelist = graph->Get<MemOptSkipVars>(kMemOptSkipVars);
for (const auto& var : mem_opt_whitelist) skip_set_.emplace(var);
auto update_skip_set = [&](OpDesc* op_desc) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
skip_set_.insert(inputs.begin(), inputs.end());
skip_set_.insert(outputs.begin(), outputs.end());
};
auto nodes = graph->Nodes();
for (auto& op : nodes) {
if (!op->IsOp() || op->Op() == nullptr) continue;
auto* op_desc = op->Op();
// NOTE(dzhwinter):
// current block can not reuse next level block vars.
if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
// NOTE(dzhwinter):
// distributed ops input/output name need to
// keep same bettwen trainer/pserver
if (op_desc->Type() == "send") update_skip_set(op_desc);
if (op_desc->Type() == "recv") update_skip_set(op_desc);
if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
for (const auto& var : mem_opt_whitelist) {
skip_set_.emplace(var);
}
}
......
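With this change, MemoryOptimizePass no longer inspects sub-block or distributed ops itself; it only consumes the whitelist that RecordSkipMemoryOptVarsPass stores in the graph under kMemOptSkipVars. A hedged sketch of that record-then-consume split, using a plain dict in place of Paddle's graph attributes (all names here are illustrative):

```python
# Hypothetical stand-ins for the graph attribute and the two passes.
def record_skip_vars(graph):
    # Recording pass: fill the shared whitelist once, up front.
    graph["kMemOptSkipVars"] = {"fetch_var", "send_in", "send_out"}

def memory_optimize(graph, candidate_vars):
    # Optimizing pass: only reads the whitelist; no per-op special cases left.
    skip_set = graph["kMemOptSkipVars"]
    return [v for v in candidate_vars if v not in skip_set]

graph = {}
record_skip_vars(graph)
print(memory_optimize(graph, ["tmp_0", "send_in", "tmp_1"]))  # ['tmp_0', 'tmp_1']
```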
......@@ -13,11 +13,14 @@
// limitations under the License.
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace framework {
......@@ -30,26 +33,129 @@ class RecordSkipMemoryOptVarsPass : public ir::Pass {
graph->Set(kMemOptSkipVars, new MemOptSkipVars);
auto& skip_vars = graph->Get<MemOptSkipVars>(kMemOptSkipVars);
std::vector<ir::Node*> op_nodes;
for (auto& node : graph->Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr.");
if (node->IsOp() && node->Op()) {
op_nodes.emplace_back(node);
}
}
// Insert kEmptyVarName to avoid optimizing empty variable
skip_vars.insert(framework::kEmptyVarName);
// NOTE(zcd): Insert OpRoleVars into SkipVarSet to prevent the vars from
// being renamed in the memory optimize pass.
InsertOpRoleVarsToSkipVarSet(graph, &skip_vars);
InsertOpRoleVarsToSkipVarSet(op_nodes, &skip_vars);
InsertSkipMemOptOpInOutToSkipVarSet(op_nodes, &skip_vars);
}
void InsertOpRoleVarsToSkipVarSet(const ir::Graph* graph,
MemOptSkipVars* skip_vars) const {
for (auto& node : graph->Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr.");
if (node->IsOp() && node->Op()) {
try {
auto op_role_vars =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0);
for (size_t i = 0; i < op_role_vars.size(); i += 2) {
auto& g_name = op_role_vars[i + 1];
skip_vars->insert(g_name);
}
} catch (boost::bad_get e) {
private:
static void InsertOpRoleVarsToSkipVarSet(const std::vector<ir::Node*>& ops,
MemOptSkipVars* skip_vars) {
for (auto& node : ops) {
try {
auto op_role_vars =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0);
for (size_t i = 0; i < op_role_vars.size(); i += 2) {
auto& g_name = op_role_vars[i + 1];
skip_vars->insert(g_name);
}
} catch (boost::bad_get& e) {
}
}
}
static void UpdateSkipVarSet(
MemOptSkipVars* skip_vars,
const std::vector<std::vector<std::string>>& var_names) {
for (auto& var_name : var_names) {
skip_vars->insert(var_name.begin(), var_name.end());
}
}
static std::vector<std::string> ToGradVarName(
const std::vector<std::string>& names) {
std::vector<std::string> ret;
ret.reserve(names.size());
for (auto& name : names) {
if (name != framework::kEmptyVarName) {
ret.emplace_back(framework::GradVarName(name));
}
}
return ret;
}
static void InsertSkipMemOptOpInOutToSkipVarSet(
const std::vector<ir::Node*>& ops, MemOptSkipVars* skip_vars) {
static std::unordered_set<std::string> kSkipMemOptOps{
"send", "recv", "prefetch", "send_barrier", "fetch_barrier"};
for (auto& node : ops) {
auto* op_desc = node->Op();
// Some ops (while, conditional_block, recurrent, etc.) have sub-blocks.
// These ops often use variables from their parent or forward blocks.
// Optimizing the inputs/outputs of such ops would make these variables
// impossible to find when running the sub-block ops.
if (OpHasSubBlock(op_desc)) {
UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(),
op_desc->OutputArgumentNames()});
}
// Skip ops that are related to the parameter server.
// In distributed mode, trainers and the parameter server use the same
// variable names to track the same variables. We cannot change the
// names of these variables; otherwise, trainers or the parameter
// server would not find them.
if (kSkipMemOptOps.count(op_desc->Type()) > 0) {
UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(),
op_desc->OutputArgumentNames()});
}
// FIXME(zjl): some ops use variables that are not from their
// inputs or outputs. We do not have a nice method to solve this
// issue yet. Currently, we should skip these variables when
// memory optimization is enabled.
auto op_type = op_desc->Type();
if (op_type == "while_grad") {
// In while_grad, framework::GradVarName(Input("X")) is visited without
// being listed as any input/output of while_grad, which uses these
// variables to accumulate the gradient of X across time steps.
UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("X"))});
} else if (op_type == "conditional_block_grad") {
// In conditional_block_grad, framework::GradVarName(Input("Input",
// "Cond")) is visited without being listed as any input/output of
// conditional_block_grad, which uses these variables to accumulate the
// gradients of Input/Cond.
UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("Input")),
ToGradVarName(op_desc->Input("Cond"))});
} else if (op_type == "recurrent" || op_type == "recurrent_grad") {
// Recurrent and recurrent_grad ops are implemented in a very tricky way.
// Attr("states", "ex_states") is visited without being listed as any
// input/output of the op, because these variables come from sub-blocks,
// not the main block. Adding these variables to the inputs would make
// recurrent fail since "states" and "ex_states" cannot be found in the
// main block. When memory optimization is enabled, "states", "ex_states"
// and their gradients should be skipped.
auto& ex_states =
boost::get<std::vector<std::string>>(op_desc->GetAttr("ex_states"));
auto& states =
boost::get<std::vector<std::string>>(op_desc->GetAttr("states"));
if (op_type == "recurrent") {
UpdateSkipVarSet(skip_vars, {ex_states, states});
} else {
// In recurrent_grad, framework::GradVarName(Input("parameters",
// "input")) is visited without being listed as any input/output of
// recurrent_grad, which uses these variables to accumulate the
// gradients of parameters/input across time steps.
UpdateSkipVarSet(
skip_vars,
{ToGradVarName(op_desc->Input("parameters")),
ToGradVarName(op_desc->Input("input")), ex_states, states,
ToGradVarName(ex_states), ToGradVarName(states)});
}
}
}
......
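InsertSkipMemOptOpInOutToSkipVarSet gathers every variable that must never be renamed: the inputs/outputs of ops with sub-blocks, the inputs/outputs of parameter-server ops, and the implicit gradient names (framework::GradVarName(...)) that while_grad, conditional_block_grad and recurrent touch without declaring them. The simplified Python sketch below walks a toy op list and covers only the sub-block, distributed-op and while_grad cases; ops are plain dicts, and GradVarName is assumed to follow Paddle's "@GRAD" suffix convention:

```python
SKIP_MEM_OPT_OPS = {"send", "recv", "prefetch", "send_barrier", "fetch_barrier"}

def grad_var_name(name):
    # Assumes Paddle's convention of appending "@GRAD" for gradient vars.
    return name + "@GRAD"

def collect_skip_vars(ops):
    skip = set()
    for op in ops:
        ins, outs = op.get("inputs", {}), op.get("outputs", {})
        all_args = [a for args in ins.values() for a in args] + \
                   [a for args in outs.values() for a in args]
        # Ops with sub-blocks and distributed ops: freeze all of their in/outs.
        if op.get("has_sub_block") or op["type"] in SKIP_MEM_OPT_OPS:
            skip.update(all_args)
        # while_grad accumulates GradVarName(Input("X")) without declaring it.
        if op["type"] == "while_grad":
            skip.update(grad_var_name(x) for x in ins.get("X", []))
    return skip

ops = [
    {"type": "while_grad", "has_sub_block": True,
     "inputs": {"X": ["hidden"]}, "outputs": {"Out": ["x@GRAD"]}},
    {"type": "send", "inputs": {"X": ["w"]}, "outputs": {}},
]
print(sorted(collect_skip_vars(ops)))  # ['hidden', 'hidden@GRAD', 'w', 'x@GRAD']
```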
......@@ -13,10 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
......@@ -61,7 +66,16 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
var->outputs.push_back(node);
}
// For output args, always create a new var.
std::unordered_set<std::string> out_arg_set;
for (auto &each_var_name : op->OutputArgumentNames()) {
if (each_var_name != kEmptyVarName) {
PADDLE_ENFORCE(out_arg_set.count(each_var_name) == 0,
"Program is wrong. %s occurs in output of %s several "
"times.",
each_var_name, op->Type());
out_arg_set.insert(each_var_name);
}
ir::Node *var = nullptr;
if (all_vars.count(each_var_name) != 0) {
var = CreateVarNode(all_vars.at(each_var_name));
......
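Graph::InitFromProgram now rejects programs in which an op lists the same non-empty name twice among its outputs, since every output argument is supposed to get a fresh var node. A minimal Python sketch of that validation (a hypothetical helper; "@EMPTY@" stands in for framework::kEmptyVarName):

```python
def check_unique_outputs(op_type, output_arg_names, empty_var_name="@EMPTY@"):
    """Raise if an op writes the same variable through two output slots."""
    seen = set()
    for name in output_arg_names:
        if name == empty_var_name:   # empty slots are allowed to repeat
            continue
        if name in seen:
            raise ValueError("Program is wrong. %s occurs in output of %s "
                             "several times." % (name, op_type))
        seen.add(name)

check_unique_outputs("sum", ["out", "tmp"])      # fine
try:
    check_unique_outputs("sum", ["out", "out"])  # duplicated output name
except ValueError as e:
    print(e)
```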
......@@ -261,11 +261,7 @@ class SoftmaxWithCrossEntropyInplaceInference
public:
std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc& op_desc, bool use_cuda) const {
if (use_cuda && !boost::get<bool>(op_desc.GetAttr("soft_label"))) {
return {{"Logits", "Softmax"}};
} else {
return {};
}
return {{"Logits", "Softmax"}};
}
};
......
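After this change, the inplace inference for softmax_with_cross_entropy always maps Logits to Softmax, no longer gated on use_cuda or soft_label; whether the reuse actually happens is left to the inplace pass. A hedged usage sketch with the fluid 1.x API that the test below also uses — enable_inplace is assumed to be the build-strategy switch that turns the pass on, and a PaddlePaddle 1.x install is required:

```python
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4, 3], dtype='float32',
                      append_batch_size=False)
y = fluid.layers.data(name='y', shape=[4, 1], dtype='int64',
                      append_batch_size=False)
loss, softmax = fluid.layers.softmax_with_cross_entropy(
    x, y, return_softmax=True)

build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = True  # assumed switch for the inplace pass

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(
    build_strategy=build_strategy, places=place)
out = exe.run(prog,
              feed={'x': np.random.rand(4, 3).astype('float32'),
                    'y': np.random.randint(0, 3, size=[4, 1]).astype('int64')},
              fetch_list=[loss.name])
print(out[0].shape)  # (4, 1): per-row cross entropy; Softmax may alias Logits
```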
......@@ -21,25 +21,39 @@ import unittest
class TestSoftmaxWithXe(unittest.TestCase):
def setUp(self):
self.initParameter()
self.m, self.n = np.random.random_integers(
low=100, high=2000, size=[2]).astype('int64')
def softmax_with_xe(self, x, y, place, inplace=True):
def initParameter(self):
self.dtype = 'float32'
self.soft_label = False
def softmax_with_xe(self,
x,
y,
place,
inplace=True,
numeric_stable_mode=True):
m, n = x.shape
with fluid.program_guard(fluid.Program(), fluid.Program()):
with fluid.scope_guard(fluid.Scope()):
x_d = fluid.layers.data(
name='x',
shape=[m, n],
dtype='float32',
dtype=self.dtype,
append_batch_size=False)
y_d = fluid.layers.data(
name='y',
shape=[m, 1],
dtype='int64',
shape=[m, 1] if not self.soft_label else [m, n],
dtype='int64' if not self.soft_label else self.dtype,
append_batch_size=False)
z_d, s_d = fluid.layers.softmax_with_cross_entropy(
x_d, y_d, return_softmax=True)
x_d,
y_d,
soft_label=self.soft_label,
return_softmax=True,
numeric_stable_mode=numeric_stable_mode)
exe = fluid.Executor(place)
......@@ -51,7 +65,7 @@ class TestSoftmaxWithXe(unittest.TestCase):
)).with_data_parallel(
build_strategy=build_strategy, places=place)
if inplace and isinstance(place, fluid.CUDAPlace):
if inplace:
fetch_list = [z_d.name, x_d.name]
else:
fetch_list = [z_d.name, s_d.name]
......@@ -63,16 +77,33 @@ class TestSoftmaxWithXe(unittest.TestCase):
return z, s
def main_with_place(self, place):
x = np.random.random(size=[self.m, self.n]).astype('float32')
x = np.random.random(size=[self.m, self.n]).astype(self.dtype)
x_range = [(-30, 30), (10, 20), (-1, 1), (2, 3), (0, 0.3), (-200, -100)]
for a, b in x_range:
x = ((b - a) * x + a).astype('float32')
y = np.random.random_integers(
size=[self.m, 1], low=0, high=self.n - 1).astype('int64')
z1, s1 = self.softmax_with_xe(x, y, place, False)
z2, s2 = self.softmax_with_xe(x, y, place, True)
x = ((b - a) * x + a).astype(self.dtype)
if not self.soft_label:
y = np.random.random_integers(
size=[self.m, 1], low=0, high=self.n - 1).astype('int64')
else:
y = np.random.random(size=[self.m, self.n]).astype(self.dtype)
norm_y = np.broadcast_to(
np.reshape(
np.sum(y, axis=1), [-1, 1]), y.shape)
y = y / norm_y
z1, s1 = self.softmax_with_xe(
x, y, place, inplace=False, numeric_stable_mode=False)
z2, s2 = self.softmax_with_xe(
x, y, place, inplace=True, numeric_stable_mode=False)
self.assertTrue((z1 == z2).all())
self.assertTrue((s1 == s2).all())
z1, s1 = self.softmax_with_xe(
x, y, place, inplace=False, numeric_stable_mode=True)
z2, s2 = self.softmax_with_xe(
x, y, place, inplace=True, numeric_stable_mode=True)
self.assertTrue((z1 == z2).all())
self.assertTrue((s1 == s2).all())
......@@ -82,5 +113,23 @@ class TestSoftmaxWithXe(unittest.TestCase):
self.main_with_place(fluid.CUDAPlace(0))
class TestSoftmaxWithXe1(TestSoftmaxWithXe):
def initParameter(self):
self.dtype = 'float32'
self.soft_label = True
class TestSoftmaxWithXe2(TestSoftmaxWithXe):
def initParameter(self):
self.dtype = 'float64'
self.soft_label = False
class TestSoftmaxWithXe3(TestSoftmaxWithXe):
def initParameter(self):
self.dtype = 'float64'
self.soft_label = True
if __name__ == '__main__':
unittest.main()