Commit a4c528a3 authored by baojun, committed by tensor-tang

[NGraph] some ngraph updates to enable bert (#17739)

* delay infershape test=develop

* fall back subblock to paddle test=develop

* fix edge cases test=develop

* remove output duplicates test=develop

* handle reshape2_grad infershape test=develop
Parent 3d3f5506
@@ -334,7 +334,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
 #ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) {
+  if (FLAGS_use_ngraph && ctx->block_id_ == 0) {
     paddle::operators::NgraphEngine::FuseNgraphOps(
         ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
   }
......
@@ -36,14 +36,14 @@ bool NgraphBridge::isRegister(const std::string& str) {
 bool NgraphBridge::isSupported(
     const std::unique_ptr<framework::OperatorBase>& op) {
-  static std::unordered_set<std::string> skip_op_list{"reshape", "reshape2",
-                                                      "lookup_table"};
+  static std::unordered_set<std::string> skip_op_list{
+      "reshape", "reshape2", "lookup_table", "lookup_table_grad"};
   bool result = true;
   auto& op_type = op->Type();
   auto op_attrs = paddle::framework::AttrReader(op->Attrs());
   if (!isRegister(op_type)) {
     if (skip_op_list.count(op_type)) {
-      if (op_type == "lookup_table") {
+      if (op_type == "lookup_table" || op_type == "lookup_table_grad") {
         if (op_attrs.Get<bool>("is_sparse") ||
             (op_attrs.Get<int64_t>("padding_idx") != kNoPadding)) {
           result = false;
......
@@ -38,6 +38,10 @@ namespace operators {
 static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
   ngraph::Shape sp;
+  if (dims.size() == 1 && dims[0] == 0) {
+    sp.emplace_back(0);
+    return sp;
+  }
   for (int i = 0; i < dims.size(); ++i) {
     int k = dims[i];
     k = k == 0 ? 1 : k;
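Note on the hunk above: a DDim of exactly {0} now maps to an nGraph Shape{0}, while zero entries inside larger DDims are still clamped to 1, as before. A minimal standalone sketch of that rule (illustrative only; plain std::vector stands in for framework::DDim and ngraph::Shape):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for Ddim2Shape: dims == {0} maps to {0},
// otherwise any 0 entry is treated as 1.
std::vector<size_t> ToShape(const std::vector<int64_t>& dims) {
  if (dims.size() == 1 && dims[0] == 0) return {0};
  std::vector<size_t> sp;
  for (auto d : dims) sp.push_back(d == 0 ? 1 : static_cast<size_t>(d));
  return sp;
}

int main() {
  for (auto d : ToShape({0})) std::cout << d << " ";     // prints: 0
  std::cout << "\n";
  for (auto d : ToShape({0, 3})) std::cout << d << " ";  // prints: 1 3
  std::cout << "\n";
}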
@@ -417,6 +421,15 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
       }
     }
   }
+  // remove output duplicates
+  std::unordered_set<std::string> var_out_set;
+  for (int i = static_cast<int>(var_out_.size()) - 1; i >= 0; --i) {
+    std::string var_name = var_out_.at(i);
+    if (var_out_set.count(var_name)) {
+      var_out_.erase(var_out_.begin() + i);
+    }
+    var_out_set.insert(var_name);
+  }
 }
 void NgraphEngine::GetNgInputShape() {
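The deduplication loop added to BuildNgIO above scans var_out_ from the back, so the last occurrence of each output variable name is the one kept. A small self-contained sketch of the same idea (illustrative only, not part of the commit):

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  std::vector<std::string> var_out = {"a", "b", "a", "c", "b"};
  std::unordered_set<std::string> seen;
  // Walk backwards; erase an entry if its name already appears later on.
  for (int i = static_cast<int>(var_out.size()) - 1; i >= 0; --i) {
    std::string name = var_out.at(i);
    if (seen.count(name)) {
      var_out.erase(var_out.begin() + i);
    }
    seen.insert(name);
  }
  for (const auto& v : var_out) std::cout << v << " ";  // prints: a c b
  std::cout << "\n";
}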
@@ -458,16 +471,8 @@ void NgraphEngine::BuildNgNodes() {
   }
 }
-void NgraphEngine::RunInferShape() {
-  for (auto& op : fused_ops_) {
-    framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
-    op->RuntimeInferShape(scope_, place_, ctx);
-  }
-}
 void NgraphEngine::BuildNgFunction(const framework::ExecutionContext& ctx) {
   Prepare(ctx);
-  RunInferShape();
   GetNgInputShape();
   BuildNgNodes();
   ngraph_function_ = nullptr;
@@ -626,6 +631,21 @@ void NgraphEngine::Run(const framework::Scope& scope,
     }
   }
+  for (auto& op : fused_ops_) {
+    framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
+    if (op->Type() == "reshape2_grad") {
+      auto xshape_name = op->Inputs().at("XShape").at(0);
+      auto* xshape_var = scope_.FindVar(xshape_name);
+      auto* xshape_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*xshape_var);
+      auto& xshape_ddim = xshape_tensor->dims();
+      auto xgrad_name = op->Outputs().at(framework::GradVarName("X")).at(0);
+      auto* xgrad_var = scope_.FindVar(xgrad_name);
+      xgrad_var->GetMutable<framework::LoDTensor>()->Resize(xshape_ddim);
+    } else {
+      op->RuntimeInferShape(scope_, place_, ctx);
+    }
+  }
   std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out = {};
   for (size_t i = 0; i < p_var_out->size(); ++i) {
     auto vo = p_var_out->at(i);
......
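With infershape delayed into Run, reshape2_grad is special-cased above: its X@GRAD tensor is resized directly from the recorded XShape tensor instead of going through RuntimeInferShape. A toy sketch of the shape bookkeeping this relies on (names and shapes are illustrative, assuming the XShape tensor's dims record X's original dims; they are not the engine's actual variables):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> x_dims = {2, 3, 4};    // X before the forward reshape2
  std::vector<int64_t> out_dims = {6, 4};     // Out produced by reshape2
  std::vector<int64_t> xshape_dims = x_dims;  // shape carried along via XShape
  // reshape2_grad: the gradient w.r.t. X must get X's original shape back,
  // which is what the Resize(xshape_ddim) call above achieves at runtime
  // (under the assumption stated in the lead-in).
  std::vector<int64_t> x_grad_dims = xshape_dims;
  assert(x_grad_dims == x_dims);
  (void)out_dims;
  return 0;
}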
@@ -109,8 +109,6 @@ class NgraphEngine {
   void GetNgInputShape();
   // Call ngraph bridge to map ops
   void BuildNgNodes();
-  // run paddle RuntimeInferShape to get the tensor shape
-  void RunInferShape();
   // build ngraph function call
   void BuildNgFunction(const framework::ExecutionContext& ctx);
   // Check cache for ngraph function or otherwise build the function
......
@@ -14,7 +14,9 @@ limitations under the License. */
 #pragma once
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
@@ -42,11 +44,11 @@ ngraph::NodeVector ElementwiseBinaryNodePrepare(
   if (lhs_shape == rhs_shape) {
     return ngraph::NodeVector{lhs, rhs};
   }
-  axis = (rhs_shape.size() == 0) ? lhs_shape.size() - 1 : axis;
   axis = (axis == -1 ? lhs_shape.size() - rhs_shape.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < (int)(lhs_shape.size()),
                  "Axis should be in range [0, lhs_shape)");
   paddle::platform::TrimTrailingSingularDims(&rhs_shape);
+  axis = (rhs_shape.size() == 0) ? lhs_shape.size() : axis;
   int pre, n, post;
   paddle::platform::GetMidDims(lhs_shape, rhs_shape, axis, &pre, &n, &post);
......
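The axis fix above only applies after TrimTrailingSingularDims, when rhs collapses to a scalar. For the ordinary broadcast case, a worked example of how the axis splits lhs into pre x n x post (this mirrors the usual semantics of paddle::platform::GetMidDims as I understand them; illustrative only, not the library's code):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> lhs = {2, 3, 4, 5};
  std::vector<int> rhs = {4, 5};
  // axis == -1 means "align rhs with the trailing dims of lhs"
  int axis = static_cast<int>(lhs.size()) - static_cast<int>(rhs.size());  // 2
  int pre = 1, n = 1, post = 1;
  for (int i = 0; i < axis; ++i) pre *= lhs[i];
  for (size_t i = 0; i < rhs.size(); ++i) n *= rhs[i];
  for (size_t i = axis + rhs.size(); i < lhs.size(); ++i) post *= lhs[i];
  std::printf("pre=%d n=%d post=%d\n", pre, n, post);  // pre=6 n=20 post=1
}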
@@ -35,6 +35,7 @@ static void BuildMulNode(
   int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
   auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
   auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
+  int y_rank = y->get_shape().size();
   auto x_reshape = x;
   auto y_reshape = y;
@@ -52,10 +53,14 @@
   std::shared_ptr<ngraph::Node> out =
       std::make_shared<ngraph::op::Dot>(x_reshape, y_reshape);
-  auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
-  if (dummy_out && dummy_out->get_shape() != out->get_shape()) {
-    out = paddle::platform::NgReshaper(out, dummy_out->get_shape());
-  }
+  ngraph::Shape out_shape;
+  for (int i = 0; i < x_num_col_dims; ++i) {
+    out_shape.push_back(x->get_shape()[i]);
+  }
+  for (int i = y_num_col_dims; i < y_rank; ++i) {
+    out_shape.push_back(y->get_shape()[i]);
+  }
+  out = paddle::platform::NgReshaper(out, out_shape);
   paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
 }
......
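With infershape delayed, BuildMulNode above no longer leans on the dummy Out node's shape; it derives the output shape directly as the first x_num_col_dims dims of X followed by the dims of Y after y_num_col_dims. A hypothetical helper showing that rule on a concrete case (illustrative only):

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical helper mirroring the out_shape computation added above.
std::vector<size_t> MulOutShape(const std::vector<size_t>& x_shape,
                                const std::vector<size_t>& y_shape,
                                int x_num_col_dims, int y_num_col_dims) {
  std::vector<size_t> out;
  for (int i = 0; i < x_num_col_dims; ++i) out.push_back(x_shape[i]);
  for (size_t i = y_num_col_dims; i < y_shape.size(); ++i)
    out.push_back(y_shape[i]);
  return out;
}

int main() {
  // X: [2, 3, 4] flattened with x_num_col_dims = 1 behaves like [2, 12];
  // Y: [12, 5] with y_num_col_dims = 1 stays [12, 5].
  for (auto d : MulOutShape({2, 3, 4}, {12, 5}, 1, 1)) std::cout << d << " ";
  std::cout << "\n";  // prints: 2 5
}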
@@ -60,17 +60,20 @@
   ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
                              static_cast<size_t>(strides.at(1))};
-  auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) {
+  auto ComputeFlooredOutput = [](size_t in, size_t k, size_t p, size_t s) {
     return (in - k + 2 * p) / s + 1;
   };
+  auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) {
+    return ceil(static_cast<float>(in - k + 2 * p) / s) + 1;
+  };
   if (op_attrs.Get<bool>("ceil_mode")) {
-    auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
-    auto dummpy_shape = dummy_out->get_shape();
     for (size_t i = 0; i < ng_padding_above.size(); ++i) {
-      auto desired_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i],
-                                              paddings[i], strides[i]);
-      if (desired_size != dummpy_shape[i + 2]) {
+      auto ceiled_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i],
+                                             paddings[i], strides[i]);
+      auto floored_size = ComputeFlooredOutput(x_shape[i + 2], ksize[i],
+                                               paddings[i], strides[i]);
+      if (ceiled_size != floored_size) {
         ng_padding_above[i] += strides[i];
       }
     }
@@ -96,6 +99,10 @@ void BuildPool2dNode(
       pool2d =
           std::make_shared<ngraph::op::AvgPool>(x, ng_ksize_shape, ng_strides);
     } else {
+      if ((ng_padding_below[0] == 0) && (ng_padding_below[1] == 0) &&
+          (ng_padding_above[0] == 0) && (ng_padding_above[1] == 0)) {
+        padding_exclusive = false;
+      }
       pool2d = std::make_shared<ngraph::op::AvgPool>(
           x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above,
           !padding_exclusive);
@@ -163,6 +170,10 @@ void BuildPool2dGradNode(
         x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
         ng_padding_above, !padding_exclusive);
   } else {
+    if ((ng_padding_below[0] == 0) && (ng_padding_below[1] == 0) &&
+        (ng_padding_above[0] == 0) && (ng_padding_above[1] == 0)) {
+      padding_exclusive = false;
+    }
     pool2d_grad = std::make_shared<ngraph::op::AvgPoolBackprop>(
         x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
         ng_padding_above, !padding_exclusive);
......
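With infershape delayed, the ceil_mode adjustment in the pool2d hunks above no longer consults the pre-inferred output tensor's shape; it compares the ceiled and floored output sizes directly, and padding_exclusive is forced to false when all paddings are zero. A small numeric check of the ceil-vs-floor rule (illustrative only, not part of the commit):

#include <cmath>
#include <cstdio>

int main() {
  // in=6, k=3, p=0, s=2: the floored size is 2 and the ceiled size is 3,
  // so one extra stride of padding is added above.
  size_t in = 6, k = 3, p = 0, s = 2;
  size_t floored = (in - k + 2 * p) / s + 1;
  size_t ceiled =
      static_cast<size_t>(std::ceil(static_cast<float>(in - k + 2 * p) / s)) + 1;
  size_t extra_pad_above = (ceiled != floored) ? s : 0;
  std::printf("floored=%zu ceiled=%zu extra_pad_above=%zu\n", floored, ceiled,
              extra_pad_above);
}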