PaddlePaddle / Paddle, commit 04e3b62f (unverified)

Authored by Zhanlue Yang on Nov 26, 2021; committed via GitHub on Nov 26, 2021.
Parent commit: a0b895c0

[Eager] Supported Eager Dygraph Forward & AutoGrad functions (#37323)

Showing 14 changed files with 615 additions and 3 deletions (+615, -3).
Changed files (14):

  paddle/fluid/eager/CMakeLists.txt                                           +2    -2
  paddle/fluid/eager/api/CMakeLists.txt                                       +3    -0
  paddle/fluid/eager/api/all.h                                                +1    -0
  paddle/fluid/eager/api/generated/CMakeLists.txt                             +1    -0
  paddle/fluid/eager/api/generated/eager_generated/CMakeLists.txt             +2    -0
  paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt   +1    -0
  paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc    +172  -0
  paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h     +54   -0
  paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt    +1    -0
  paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc          +100  -0
  paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h           +23   -0
  paddle/fluid/eager/tests/CMakeLists.txt                                     +3    -1
  paddle/fluid/eager/tests/task_tests/CMakeLists.txt                          +1    -0
  paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc               +251   -0
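Taken together, these files add a hand-written eager-mode forward function (egr::scale) that both runs the computation and records a GradNodeScale for autograd. As a rough illustration of how the new API is meant to be called, here is a minimal sketch condensed from the forward_autograd_test.cc added below; it is not part of the commit itself and assumes only the same test helpers and includes that the test uses.

using namespace egr;  // NOLINT

namespace eager_test {

void ScaleForwardDemo() {
  // Prepare device contexts, exactly as the test does.
  InitEnv(paddle::platform::CPUPlace());

  // A 4x16x16x32 float tensor filled with 5.0.
  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});
  egr::EagerTensor x = CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
      pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
  EagerUtils::autograd_meta(&x)->SetStopGradient(false);

  // Forward: out = 2.0 * x + 3.0. Because trace_backward is true, a
  // GradNodeScale is attached to out's AutogradMeta for the later backward.
  egr::EagerTensor out =
      egr::scale(x, 2.0 /*scale*/, 3.0 /*bias*/, true /*bias_after_scale*/,
                 true /*trace_backward*/);
  GradNodeBase* node = EagerUtils::autograd_meta(&out)->GradNode();
  CHECK_NOTNULL(dynamic_cast<GradNodeScale*>(node));
}

}  // namespace eager_test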
paddle/fluid/eager/CMakeLists.txt  (+2, -2; the inline diff below shows removed and added lines together)

add_subdirectory(accumulation)
add_subdirectory(api)
add_subdirectory(accumulation)
add_subdirectory(tests)

cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api)
cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulation)
cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta)
paddle/fluid/eager/api/CMakeLists.txt  (+3, -0)

add_subdirectory(utils)
add_subdirectory(generated)

cc_library(eager_api SRCS all.cc DEPS global_utils eager_scale)
paddle/fluid/eager/api/all.h  (+1, -0)

@@ -14,4 +14,5 @@
//
#pragma once

#include "paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
paddle/fluid/eager/api/generated/CMakeLists.txt  (new file, +1)

add_subdirectory(eager_generated)
paddle/fluid/eager/api/generated/eager_generated/CMakeLists.txt  (new file, +2)

add_subdirectory(backwards)
add_subdirectory(forwards)
paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt  (new file, +1)

cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info)
paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc  (new file, +172)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/eager_tensor.h"

#include "paddle/pten/api/all.h"

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"

#include "glog/logging.h"

namespace egr {

template <typename DeviceContext>
static void ScaleDeviceDispatch(const pten::DenseTensor& dense_tensor,
                                const DeviceContext& dev_ctx, float scale,
                                float bias, bool bias_after_scale,
                                pten::DenseTensor* dense_out) {
  switch (dense_tensor.dtype()) {
    case pten::DataType::FLOAT64: {
      pten::Scale<double>(dev_ctx, dense_tensor /* tensor */,
                          scale /* scale */, bias /* bias */,
                          bias_after_scale /* bias_after_scale */,
                          dense_out /* out tensor */);
      break;
    }
    case pten::DataType::FLOAT32: {
      pten::Scale<float>(dev_ctx, dense_tensor /* tensor */, scale /* scale */,
                         bias /* bias */,
                         bias_after_scale /* bias_after_scale */,
                         dense_out /* out tensor */);
      break;
    }
    case pten::DataType::INT64: {
      pten::Scale<int64_t>(dev_ctx, dense_tensor /* tensor */,
                           scale /* scale */, bias /* bias */,
                           bias_after_scale /* bias_after_scale */,
                           dense_out /* out tensor */);
      break;
    }
    case pten::DataType::INT32: {
      pten::Scale<int32_t>(dev_ctx, dense_tensor /* tensor */,
                           scale /* scale */, bias /* bias */,
                           bias_after_scale /* bias_after_scale */,
                           dense_out /* out tensor */);
      break;
    }
    default: {
      PADDLE_THROW(paddle::platform::errors::Fatal(
          "Detected unsupported data type. "
          "Only Float64, Float32, Int64, Int32 are supported for now."));
      break;
    }
  }
}

void ScaleAPI(const egr::EagerTensor& x, float scale, float bias,
              bool bias_after_scale, egr::EagerTensor* out) {
  // TODO(jiabin): Support multiple tensors here; creating a DenseTensor is
  // not a proper way to demo it.

  // Run Forward Function
  auto dense_tensor = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());

  // Init output tensor
  auto tensor_meta = pten::DenseTensorMeta(
      dense_tensor->dtype(), dense_tensor->dims(), dense_tensor->layout());
  auto place = dense_tensor->place();
  size_t bytes_size = paddle::framework::product(dense_tensor->dims()) *
                      SizeOf(dense_tensor->dtype());
  auto dense_out = std::make_shared<pten::DenseTensor>(
      pten::make_intrusive<paddle::experimental::SharedStorage>(
          paddle::memory::Alloc(place, bytes_size), 0),
      std::move(tensor_meta));

  // Handle Device Context
  const paddle::platform::Place& expected_kernel_place =
      Controller::Instance().GetExpectedPlace();
  paddle::platform::DeviceContextPool& pool =
      paddle::platform::DeviceContextPool::Instance();

  if (expected_kernel_place == paddle::platform::CPUPlace()) {
    auto* dev_ctx = dynamic_cast<paddle::platform::CPUDeviceContext*>(
        pool.Get(expected_kernel_place));
    if (!dev_ctx) {
      PADDLE_THROW(paddle::platform::errors::Fatal(
          "Cannot convert device_context to CPUDeviceContext. "
          "This indicates backend mismatch. "
          "Please double check your expected place."));
    }
    ScaleDeviceDispatch<paddle::platform::CPUDeviceContext>(
        *dense_tensor.get(), *dev_ctx, scale, bias, bias_after_scale,
        dense_out.get());

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  } else if (expected_kernel_place == paddle::platform::CUDAPlace()) {
    auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
        pool.Get(expected_kernel_place));
    if (!dev_ctx) {
      PADDLE_THROW(paddle::platform::errors::Fatal(
          "Cannot convert device_context to CUDADeviceContext. "
          "This indicates backend mismatch. "
          "Please double check your expected place."));
    }
    ScaleDeviceDispatch<paddle::platform::CUDADeviceContext>(
        *dense_tensor.get(), *dev_ctx, scale, bias, bias_after_scale,
        dense_out.get());
#endif

  } else {
    PADDLE_THROW(paddle::platform::errors::Fatal(
        "Detected unsupported backend. "
        "Only CPU and CUDA backends are supported for now. "
        "Please double check if your backend falls into the above two "
        "categories."));
  }

  out->set_impl(dense_out);
}

void GradNodeScale::SetTensorWrappers_X(
    const std::vector<egr::EagerTensor>& tensors) {
  // Does nothing for scale
}

void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }

std::vector<std::vector<egr::EagerTensor>> GradNodeScale::operator()(
    const std::vector<std::vector<egr::EagerTensor>>& grads) {
  // 1. Check Output Size
  PADDLE_ENFORCE(
      ((grads.size() == 1) && (grads[0].size() == 1)),
      paddle::platform::errors::Fatal(
          "ScaleGradNode takes exactly 1 grad tensor. However received: %d. "
          "This indicates an issue with Eager Dygraph Backward logic.",
          grads.size()));
  std::vector<std::vector<egr::EagerTensor>> outs;

  // 2. Create needed out pattern
  egr::EagerTensor out;
  // Apply Gradient Hooks
  if (GradientHooksRegistered()) {
    // TODO(jiabin): Shall we apply hooks slot by slot here, or accept
    // vector<vector<pten::Tensor>> to apply all hooks?
    std::vector<std::vector<egr::EagerTensor>> hooked_grads =
        ApplyGradientHooks(grads);
    ScaleAPI(/* slot by slot set */ hooked_grads[0][0], scale_,
             0.0 /* bias */, true /* bias_after_scale */, &out);
  } else {
    ScaleAPI(grads[0][0], scale_, 0.0 /* bias */, true /* bias_after_scale */,
             &out);
  }

  // Apply Reduce Hooks
  if (ReduceHooksRegistered()) {
    ApplyReduceHooks();
  }
  return {{out}};
}

}  // namespace egr
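For reference, GradNodeScale::operator() is what the eager backward pass is expected to call: it receives one grad tensor per backward input slot and produces the grads for the forward inputs. The sketch below drives the node by hand; RunScaleBackward is a hypothetical helper, and grad_out stands for an already-constructed dL/d(out) tensor whose creation is outside this commit.

using namespace egr;  // NOLINT

// Sketch only: invoke GradNodeScale the way a backward engine would.
std::vector<std::vector<egr::EagerTensor>> RunScaleBackward(
    GradNodeScale* node, const egr::EagerTensor& grad_out) {
  // Slot layout mirrors the forward: one input slot, one tensor in the slot.
  std::vector<std::vector<egr::EagerTensor>> grads = {{grad_out}};
  // operator() applies gradient hooks (if any are registered) and then calls
  // ScaleAPI(grad, scale_, 0.0, true, &out): since out = scale * x + bias,
  // the gradient w.r.t. x is simply scale_ * grad_out.
  return (*node)(grads);
}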
paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h  (new file, +54)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/tensor_wrapper.h"

/*
  Each Operation has a specific GradNode inherited from GradNodeBase.
  A specific GradNode defines:
  1. Input Tensors
  2. An operator() override that performs the actual backward computations

  TODO: Generate GradNode via auto code generation
*/
namespace egr {

// ScaleAPI runs the actual scale computation; it is shared by the eager
// forward function and by GradNodeScale's backward.
void ScaleAPI(const egr::EagerTensor& x, float scale, float bias,
              bool bias_after_scale, egr::EagerTensor* out);

class GradNodeScale : public GradNodeBase {
 public:
  // Constructor: configure fwd input tensors to grad node
  GradNodeScale(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
      : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
  ~GradNodeScale() override = default;

  // Functor: perform backward computations
  virtual std::vector<std::vector<egr::EagerTensor>> operator()(
      const std::vector<std::vector<egr::EagerTensor>>& grads) override;

  void SetTensorWrappers_X(const std::vector<egr::EagerTensor>& tensors);

  void SetAttributes_scale(float scale);

  // Members: define fwd input tensors
  // For Scale there is no fwd input tensor needed

 private:
  float scale_{1.0};
};

}  // namespace egr
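The header comment spells out the pattern every operator is expected to follow until code generation lands: subclass GradNodeBase, store the attributes and tensor wrappers the backward needs, and override operator(). A hedged sketch for a hypothetical elementwise op follows; GradNodeMyOp, MyOpGradAPI and alpha_ are illustrative names, not part of this commit.

// Hypothetical example following the GradNodeScale pattern above.
class GradNodeMyOp : public GradNodeBase {
 public:
  GradNodeMyOp(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
      : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
  ~GradNodeMyOp() override = default;

  // 1. Forward input tensors needed by the backward are captured here
  //    (empty for ops that, like scale, need none).
  void SetTensorWrappers_X(const std::vector<egr::EagerTensor>& tensors) {}

  // 2. Forward attributes are stored as plain members.
  void SetAttributes_alpha(float alpha) { alpha_ = alpha; }

  // 3. operator() performs the actual backward computation.
  std::vector<std::vector<egr::EagerTensor>> operator()(
      const std::vector<std::vector<egr::EagerTensor>>& grads) override {
    egr::EagerTensor grad_x;
    // MyOpGradAPI(grads[0][0], alpha_, &grad_x);  // placeholder kernel call
    return {{grad_x}};
  }

 private:
  float alpha_{1.0};
};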
paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt  (new file, +1)

cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node)
paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc  (new file, +100)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * This file should be automatically generated by the code auto generator.
 * All ops' C++ autograd logic is defined here; in the Python-C extension API
 * system we try to avoid any autograd-related code and move it all here.
 *
 * Currently, we just manually write some fwd autograd here, and we will
 * replace it with the auto code generator later.
 **/

#include "paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/utils.h"

#include "paddle/pten/api/all.h"
#include "paddle/pten/include/core.h"

namespace egr {

egr::EagerTensor scale(const egr::EagerTensor& x, float scale, float bias,
                       bool bias_after_scale, bool trace_backward) {
  // 1. Run Forward
  // 1.1 Create outputs
  egr::EagerTensor out;
  // 1.2 Needed by the original op: we assemble ins, outs, attrs here
  // 1.3 Call forward C++ api
  ScaleAPI(x, scale, bias, bias_after_scale, &out);

  // 2. Build Backward Depends
  // 2.1 Get AutogradMetas for all ins and outs
  auto p_autograd_in = EagerUtils::unsafe_autograd_meta(x);
  // NOTE: Call EagerUtils::multi_autograd_meta when we have vector of outputs
  auto p_autograd_out = EagerUtils::autograd_meta(&out);

  // 2.2 Add GradNode
  // 2.2.1 ComputeRequireGrad
  // TODO(jiabin): make this function accept different kinds of input
  // TODO(zhanlve): which one is more efficient:
  // 1. construct a vector of pointers
  // 2. call "ComputeRequireGrad" multiple times
  bool require_any_grad =
      EagerUtils::ComputeRequireGrad(trace_backward, p_autograd_in);
  if (require_any_grad) {
    EagerUtils::PassStopGradient(false /*generate_grad*/, p_autograd_out);

    // 2.2.2 Set OutRankInfo for outputs; this needs to be the same as the
    // Edges' input_rank_
    /** Note:
    // 1. We provide EagerUtils::SetMultiOutRank(vector<AutogradMeta*>),
    // since some Operators have several slot names with duplicate outputs.
    // 2. We call AutogradMeta's SetOutputRank only when we have a single
    // output with a single slot name.
    **/
    p_autograd_out->SetSingleOutRankWithSlot(0, 0);

    // Init GradNode
    auto scale_node = std::make_shared<GradNodeScale>(
        /* fwd_in_slot_num */ 1, /* bwd_in_slot_num */ 1);

    // Pass Attributes to GradNode
    scale_node->SetAttributes_scale(scale);

    // Set Next Edges
    scale_node->AddEdges(*p_autograd_in, /*slot id*/ 0);

    // Set TensorWrappers
    scale_node->SetTensorWrappers_X({x});

    // Set Grad out rank the same as fwd input and set stop gradient to bwd
    scale_node->SetGradOutMeta(*p_autograd_in, /*slot id*/ 0);
    // Set Grad in rank the same as fwd output and set stop gradient to bwd
    scale_node->SetGradInMeta(*p_autograd_out, /*slot id*/ 0);

    // Set History for output: record the current Grad Node on the output
    EagerUtils::SetHistory(p_autograd_out, scale_node);
  }

  return out;
}

}  // namespace egr
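Note the require_any_grad guard: when trace_backward is false (or the input stops gradient), the whole GradNode block is skipped and only the forward computation runs. A minimal forward-only sketch, reusing only what this commit defines (x is assumed to be an already-initialized egr::EagerTensor):

// Sketch: inference-style call. With trace_backward=false, ComputeRequireGrad
// returns false, so no GradNodeScale is created and `out` carries no history.
egr::EagerTensor out = egr::scale(x, 2.0f /*scale*/, 3.0f /*bias*/,
                                  true /*bias_after_scale*/,
                                  false /*trace_backward*/);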
paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h  (new file, +23)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/eager/eager_tensor.h"

namespace egr {

egr::EagerTensor scale(const egr::EagerTensor& x, float scale, float bias,
                       bool bias_after_scale, bool trace_backward);

}  // namespace egr
paddle/fluid/eager/tests/CMakeLists.txt  (+3, -1; the inline diff below shows the removed and added set(eager_deps ...) lines together)

set(eager_deps pten pten_api utils tensor_utils global_utils pten_tensor autograd_meta grad_node_info grad_tensor_holder gradient_accumulation accumulation_node)
set(eager_deps pten pten_api tensor_utils utils global_utils pten_tensor autograd_meta grad_node_info grad_tensor_holder gradient_accumulation accumulation_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)

add_subdirectory(data_structure_tests)
add_subdirectory(task_tests)
paddle/fluid/eager/tests/task_tests/CMakeLists.txt  (+1, -0)

cc_test(test_egr_task_tensor_utils SRCS tensor_utils_test.cc DEPS ${eager_deps})
cc_test(test_egr_task_eager_utils SRCS eager_utils_test.cc DEPS ${eager_deps})
cc_test(test_egr_task_forward_autograd SRCS forward_autograd_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
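The new target links against both the eager and fluid dependency sets defined one level up. Assuming cc_test registers a CTest target of the same name, as it does for Paddle's other unit tests, the test can be run on its own from the build directory with ctest -R test_egr_task_forward_autograd.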
paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc  (new file, +251)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <sstream>

#include "glog/logging.h"
#include "gtest/gtest.h"

#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/tests/test_utils.h"

#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/tensor_meta.h"

// TODO(jiabin): remove nolint here!!!
using namespace egr;  // NOLINT

namespace eager_test {

TEST(Forward, SingleNode) {
  // Prepare Device Contexts
  InitEnv(paddle::platform::CPUPlace());

  // Prepare Inputs
  std::vector<egr::EagerTensor> target_tensors;
  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});

  // Create Target Tensor
  egr::EagerTensor t = CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
      pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
  target_tensors.emplace_back(std::move(t));
  egr::EagerTensor& tensor = target_tensors[0];
  EagerUtils::autograd_meta(&tensor)->SetStopGradient(false);

  // Run Forward
  float scale = 2.0;
  float bias = 3.0;
  egr::EagerTensor out = egr::scale(
      tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/);

  // Examine Forward Output
  CompareTensorWithValue<float>(out, 13.0);

  // Examine GradNode
  {
    // 1. GradNode
    AutogradMeta* meta = EagerUtils::autograd_meta(&out);
    GradNodeBase* grad_node = meta->GradNode();
    GradNodeScale* scale_node = dynamic_cast<GradNodeScale*>(grad_node);
    CHECK_NOTNULL(scale_node);
    CHECK_EQ(static_cast<int>(meta->OutRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(meta->OutRankInfo().second), 0);
  }
}

/*
 inp
  |
Node0
  |
Node1
  |
 out
*/
TEST(Forward, LinearNodes) {
  InitEnv(paddle::platform::CPUPlace());

  // Prepare Inputs
  std::vector<egr::EagerTensor> target_tensors;
  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});

  // Create Target Tensor
  egr::EagerTensor t = CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
      pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
  target_tensors.emplace_back(std::move(t));
  egr::EagerTensor& tensor = target_tensors[0];
  EagerUtils::autograd_meta(&tensor)->SetStopGradient(false);

  // Run Forward Node 0
  float scale0 = 2.0;
  float bias0 = 3.0;
  egr::EagerTensor out0 =
      egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
                 true /*trace_backward*/);

  // Run Forward Node 1
  float scale1 = 5.0;
  float bias1 = 10.0;
  egr::EagerTensor out1 = egr::scale(
      out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);

  // Examine Forward Output 0
  CompareTensorWithValue<float>(out0, 13.0);

  // Examine Forward Output 1
  CompareTensorWithValue<float>(out1, 75.0);

  // Examine GradNode
  {
    // 1. GradNode
    // Node 0
    AutogradMeta* meta0 = EagerUtils::autograd_meta(&out0);
    GradNodeBase* grad_node0 = meta0->GradNode();
    GradNodeScale* scale_node0 = dynamic_cast<GradNodeScale*>(grad_node0);
    CHECK_NOTNULL(scale_node0);
    CHECK_EQ(static_cast<int>(meta0->OutRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(meta0->OutRankInfo().second), 0);

    // Node 1
    AutogradMeta* meta1 = EagerUtils::autograd_meta(&out1);
    GradNodeBase* grad_node1 = meta1->GradNode();
    GradNodeScale* scale_node1 = dynamic_cast<GradNodeScale*>(grad_node1);
    CHECK_NOTNULL(scale_node1);
    CHECK_EQ(static_cast<int>(meta1->OutRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(meta1->OutRankInfo().second), 0);

    // 2. TensorWrapper: No TensorWrapper for ScaleNode
    // 3. NextEdges: Node 1 -> Node 0
    const std::vector<std::vector<Edge>>& node1_edges = grad_node1->GetEdges();
    const auto& node1_edge = node1_edges[0];

    CHECK_EQ(static_cast<int>(node1_edge[0].GetEdgeRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(node1_edge[0].GetEdgeRankInfo().second), 0);
    CHECK_EQ(node1_edge[0].GetGradNode(), grad_node0);
  }
}

/*
       inp
        |
      Node0
    ____|____
    |       |
  Node1   Node2
    |       |
   out1    out2
*/
TEST(Forward, BranchedNodes) {
  InitEnv(paddle::platform::CPUPlace());

  // Prepare Inputs
  std::vector<egr::EagerTensor> target_tensors;
  paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32});

  // Create Target Tensor
  egr::EagerTensor t = CreateTensorWithValue(
      ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
      pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
  target_tensors.emplace_back(std::move(t));
  egr::EagerTensor& tensor = target_tensors[0];
  EagerUtils::autograd_meta(&tensor)->SetStopGradient(false);

  // Run Forward Node 0
  float scale0 = 2.0;
  float bias0 = 3.0;
  egr::EagerTensor out0 =
      egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
                 true /*trace_backward*/);

  // Run Forward Node 1
  float scale1 = 5.0;
  float bias1 = 10.0;
  egr::EagerTensor out1 = egr::scale(
      out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);

  // Run Forward Node 2
  float scale2 = 10.0;
  float bias2 = 20.0;
  egr::EagerTensor out2 = egr::scale(
      out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/);

  // Examine Forward Output 0
  CompareTensorWithValue<float>(out0, 13.0);
  // Examine Forward Output 1
  CompareTensorWithValue<float>(out1, 75.0);
  // Examine Forward Output 2
  CompareTensorWithValue<float>(out2, 150.0);

  // Examine GradNode
  {
    // 1. GradNode
    // Node 0
    AutogradMeta* meta0 = EagerUtils::autograd_meta(&out0);
    GradNodeBase* grad_node0 = meta0->GradNode();
    GradNodeScale* scale_node0 = dynamic_cast<GradNodeScale*>(grad_node0);
    CHECK_NOTNULL(scale_node0);
    CHECK_EQ(static_cast<int>(meta0->OutRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(meta0->OutRankInfo().second), 0);

    // Node 1
    AutogradMeta* meta1 = EagerUtils::autograd_meta(&out1);
    GradNodeBase* grad_node1 = meta1->GradNode();
    GradNodeScale* scale_node1 = dynamic_cast<GradNodeScale*>(grad_node1);
    CHECK_NOTNULL(scale_node1);
    CHECK_EQ(static_cast<int>(meta1->OutRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(meta1->OutRankInfo().second), 0);

    // Node 2
    AutogradMeta* meta2 = EagerUtils::autograd_meta(&out2);
    GradNodeBase* grad_node2 = meta2->GradNode();
    GradNodeScale* scale_node2 = dynamic_cast<GradNodeScale*>(grad_node2);
    CHECK_NOTNULL(scale_node2);
    CHECK_EQ(static_cast<int>(meta2->OutRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(meta2->OutRankInfo().second), 0);

    // 2. TensorWrapper: No TensorWrapper for ScaleNode
    // 3. NextEdges
    // Node 1 -> Node 0
    const std::vector<std::vector<Edge>>& node1_edges = grad_node1->GetEdges();
    const Edge& node1_edge = node1_edges[0][0];

    CHECK_EQ(static_cast<int>(node1_edge.GetEdgeRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(node1_edge.GetEdgeRankInfo().second), 0);
    CHECK_EQ(node1_edge.GetGradNode(), grad_node0);

    // Node 2 -> Node 0
    const std::vector<std::vector<Edge>>& node2_edges = grad_node2->GetEdges();
    const Edge& node2_edge = node2_edges[0][0];

    CHECK_EQ(static_cast<int>(node2_edge.GetEdgeRankInfo().first), 0);
    CHECK_EQ(static_cast<int>(node2_edge.GetEdgeRankInfo().second), 0);
    CHECK_EQ(node2_edge.GetGradNode(), grad_node0);
  }
}

}  // namespace eager_test
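The expected values checked above are consistent with the scale formula when bias_after_scale is true, i.e. out = scale * x + bias applied elementwise to an input filled with 5.0:

  Node 0:  2.0 *  5.0 +  3.0 =  13.0
  Node 1:  5.0 * 13.0 + 10.0 =  75.0   (LinearNodes and BranchedNodes)
  Node 2: 10.0 * 13.0 + 20.0 = 150.0   (BranchedNodes)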