add copy_cross_scope (#32432)

5943ff7b · Baibaifan · GitHub · b0556764 · 5943ff7b · 5943ff7b
4 changed files
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -199,3 +199,7 @@ endif()
 if(WITH_ASCEND_CL)
 cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor)
 endif()
+# Build the copy_cross_scope unit test only when GPU or Ascend NPU support is
+# enabled; the test source defines its TEST cases only under
+# PADDLE_WITH_CUDA / PADDLE_WITH_ASCEND_CL.
+if (WITH_GPU OR WITH_ASCEND_CL)
+cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor)
+endif()
--- a/paddle/fluid/operators/copy_cross_scope_op.cc
+++ b/paddle/fluid/operators/copy_cross_scope_op.cc
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+// Forward declarations of framework / imperative types, so this file can name
+// them without relying on their full definitions.
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+// File-level shorthands for the tensor types used by the operator below.
+using LoDTensor = paddle::framework::LoDTensor;
+using Tensor = paddle::framework::Tensor;
+namespace paddle {
+namespace operators {
+// Operator that copies the variable named by input "X" between the child
+// (micro-batch) scopes of the scope it is run in.
+//
+// Input "Id" holds a single int64 index selecting the current micro scope
+// among `scope.kids()`. The value of "X" in that scope is copied into the
+// micro scope at position Id + 1, when such a scope exists. If the attribute
+// "to_main_scope" is true, the same value is also copied into the variable
+// "X" of the scope the op runs in (the main scope).
+//
+// Raises InvalidArgument when the number of child scopes differs from the
+// "num_micro_batches" attribute or when Id is outside
+// [0, number of micro scopes), and NotFound when a required variable is
+// missing.
+class CopyCrossScopeOp : public framework::OperatorBase {
+ public:
+  CopyCrossScopeOp(const std::string& type,
+                   const framework::VariableNameMap& inputs,
+                   const framework::VariableNameMap& outputs,
+                   const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  // Intentionally empty: this op declares no outputs whose shape would need
+  // to be inferred.
+  void InferShape(framework::InferShapeContext* ctx) const {}
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& micro_scopes = scope.kids();
+    const int num_micro_scopes = static_cast<int>(micro_scopes.size());
+    const int num_micro_batches = Attr<int>("num_micro_batches");
+    const bool ToM = Attr<bool>("to_main_scope");
+    PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches,
+                      platform::errors::InvalidArgument(
+                          "For pipeline, number of micro scopes (%d) should "
+                          "be equal to number of micro batches (%d).",
+                          num_micro_scopes, num_micro_batches));
+    const std::string& id_name = Input("Id");
+    auto* id_var = scope.FindVar(id_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        id_var,
+        platform::errors::NotFound("No variable with name %s found.", id_name));
+    auto* id_tensor = id_var->GetMutable<LoDTensor>();
+    // The id may live on a device; bring it to the CPU before reading it.
+    framework::Tensor cpu_id_tensor;
+    TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor);
+    const int64_t micro_id = *cpu_id_tensor.data<int64_t>();
+    // Reject ids that do not name an existing micro scope; walking the list
+    // iterator past end() (or dereferencing end()) is undefined behaviour.
+    PADDLE_ENFORCE_GE(
+        micro_id, static_cast<int64_t>(0),
+        platform::errors::InvalidArgument(
+            "The micro scope id (%d) should not be negative.", micro_id));
+    PADDLE_ENFORCE_LT(
+        micro_id, static_cast<int64_t>(num_micro_scopes),
+        platform::errors::InvalidArgument(
+            "The micro scope id (%d) should be less than the number of "
+            "micro scopes (%d).",
+            micro_id, num_micro_scopes));
+    auto source_it = micro_scopes.begin();
+    for (int64_t i = 0; i < micro_id; ++i) {
+      ++source_it;
+    }
+    auto next_it = source_it;
+    ++next_it;
+    const bool has_next_scope = next_it != micro_scopes.end();
+    // Last micro scope and no copy-back requested: nothing to do.
+    if (!has_next_scope && !ToM) {
+      return;
+    }
+    const std::string& x_name = Input("X");
+    auto* source_var = (*source_it)->FindVar(x_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        source_var,
+        platform::errors::NotFound(
+            "No variable with name %s found in source scope.", x_name));
+    auto* src_tensor = source_var->GetMutable<LoDTensor>();
+    if (has_next_scope) {
+      // Hand the value over to the micro scope at position Id + 1.
+      auto* dst_var = (*next_it)->FindVar(x_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          dst_var,
+          platform::errors::NotFound(
+              "No variable with name %s found in destination scope.",
+              x_name));
+      auto* dst_tensor = dst_var->GetMutable<LoDTensor>();
+      TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor);
+    }
+    if (ToM) {
+      // Copy the current micro scope's value back into the main scope.
+      auto* main_var = scope.FindVar(x_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          main_var,
+          platform::errors::NotFound(
+              "No variable with name %s found in destination scope.", x_name));
+      auto* main_tensor = main_var->GetMutable<LoDTensor>();
+      TensorCopySync(*src_tensor, main_tensor->place(), main_tensor);
+    }
+  }
+};
+// Proto maker for the copy_cross_scope op: declares the two inputs ("X" and
+// "Id"), the two attributes ("to_main_scope", defaulting to false, and
+// "num_micro_batches") and the op's documentation comment.
+class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    // Inputs: the variable to copy and the index of the current micro scope.
+    AddInput("X",
+             "(Tensor), The first input tensor of copy_cross_scope op, "
+             "which is copying micro scope.");
+    AddInput("Id",
+             "(Tensor), The second input tensor of copy_cross_scope op, "
+             "which is a id of the current micro scope.");
+    // Attributes: optional copy-back flag and the expected micro batch count.
+    AddAttr<bool>("to_main_scope", "Return current scope to main scope.")
+        .SetDefault(false);
+    AddAttr<int>("num_micro_batches", "Number of micro batches for pipeline.");
+    AddComment(R"DOC(
+      This op is used by pipeline to copy tensors across micro batch scopes. 
+      Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. 
+      If need to copy back to the main scope, using to_main_scope option to copy the variable value of 
+      the current micro scope to the main scope.
+    )DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Register the op under the name "copy_cross_scope" together with its proto
+// maker; the registration macro used here is the one for ops without a
+// gradient.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp,
+                             ops::CopyCrossScopeOpMaker);
--- a/paddle/fluid/operators/copy_cross_scope_test.cc
+++ b/paddle/fluid/operators/copy_cross_scope_test.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+#include <iostream>
+#include <list>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/copy_cross_scope_op.cc"
+#include "paddle/fluid/string/printf.h"
+// Token-pasting helper; NOTE(review): not referenced anywhere in the visible
+// part of this test file.
+#define Conn(x, y) x##y
+// Short aliases for the framework and platform namespaces used by the tests.
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+// Presumably pulls the copy_cross_scope operator registration into this test
+// binary - confirm against the USE_NO_KERNEL_OP definition.
+USE_NO_KERNEL_OP(copy_cross_scope);
+// Runs `op_type` with Id = 1 and to_main_scope = false on a scope that has
+// three child scopes whose "tmp" variables hold 0, 1 and 2, then checks that
+// "tmp" of the third child scope holds 1 afterwards (the value of the second
+// child scope).
+template <typename T>
+void Compare1(f::Scope* scope, const p::DeviceContext& ctx,
+              std::string op_type) {
+  // Main scope: "tmp" holds 1.0 and "Id" holds 1.
+  auto* main_tensor = scope->Var("tmp")->GetMutable<f::LoDTensor>();
+  std::vector<T> main_x = {1.0};
+  TensorFromVector(main_x, ctx, main_tensor);
+  auto* id_tensor = scope->Var("Id")->GetMutable<f::LoDTensor>();
+  std::vector<int64_t> main_id = {1};
+  TensorFromVector(main_id, ctx, id_tensor);
+  // Child scopes: the k-th one stores k in its own "tmp".
+  for (int k = 0; k < 3; ++k) {
+    auto& micro_scope = scope->NewScope();
+    auto* micro_tensor = micro_scope.Var("tmp")->GetMutable<f::LoDTensor>();
+    std::vector<T> init_x = {static_cast<T>(k)};
+    TensorFromVector(init_x, ctx, micro_tensor);
+  }
+  ctx.Wait();
+  // Build and run the operator without copying back to the main scope.
+  f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}};
+  std::map<std::string, std::vector<std::string>> output;
+  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}},
+                                    output, attrs);
+  op->Run(*scope, ctx.GetPlace());
+  ctx.Wait();
+  // Step to the third child scope and read its "tmp" back.
+  std::list<f::Scope*>::const_iterator kid_it = scope->kids().begin();
+  for (int step = 0; step < 2; ++step) {
+    kid_it++;
+  }
+  auto* tensor_out = (*kid_it)->FindVar("tmp")->GetMutable<f::LoDTensor>();
+  std::vector<T> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  int expected = 1;
+  EXPECT_EQ(static_cast<int>(out_vec[0]), expected);
+}
+// Runs `op_type` with Id = 0 and to_main_scope = true on a scope that has
+// three child scopes whose "tmp" variables hold 0, 1 and 2, then checks that
+// "tmp" of the main scope holds 0 afterwards (the value of the first child
+// scope).
+template <typename T>
+void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
+              std::string op_type) {
+  // Main scope: "tmp" holds 1.0 and "Id" holds 0.
+  auto* main_tensor = scope->Var("tmp")->GetMutable<f::LoDTensor>();
+  std::vector<T> main_x = {1.0};
+  TensorFromVector(main_x, ctx, main_tensor);
+  auto* id_tensor = scope->Var("Id")->GetMutable<f::LoDTensor>();
+  std::vector<int64_t> main_id = {0};
+  TensorFromVector(main_id, ctx, id_tensor);
+  // Child scopes: the k-th one stores k in its own "tmp".
+  for (int k = 0; k < 3; ++k) {
+    auto& micro_scope = scope->NewScope();
+    auto* micro_tensor = micro_scope.Var("tmp")->GetMutable<f::LoDTensor>();
+    std::vector<T> init_x = {static_cast<T>(k)};
+    TensorFromVector(init_x, ctx, micro_tensor);
+  }
+  ctx.Wait();
+  // Build and run the operator with the copy back to the main scope enabled.
+  f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}};
+  std::map<std::string, std::vector<std::string>> output;
+  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}},
+                                    output, attrs);
+  op->Run(*scope, ctx.GetPlace());
+  ctx.Wait();
+  // Read "tmp" of the main scope back and compare.
+  auto* tensor_out = scope->FindVar("tmp")->GetMutable<f::LoDTensor>();
+  std::vector<T> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  int expected = 0;
+  EXPECT_EQ(static_cast<int>(out_vec[0]), expected);
+}
+// Device-specific test cases: each creates a fresh scope and a device context
+// on device 0, then runs the float variant of Compare1 (plain cross-scope
+// copy) or Compare2 (copy back to the main scope).
+#ifdef PADDLE_WITH_CUDA
+// CUDA build.
+TEST(copy_cross_scope, CUDA_fp32) {
+  f::Scope scope;
+  p::CUDADeviceContext ctx(p::CUDAPlace(0));
+  Compare1<float>(&scope, ctx, "copy_cross_scope");
+}
+TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
+  f::Scope scope;
+  p::CUDADeviceContext ctx(p::CUDAPlace(0));
+  Compare2<float>(&scope, ctx, "copy_cross_scope");
+}
+#elif PADDLE_WITH_ASCEND_CL
+// Ascend NPU build.
+TEST(copy_cross_scope, NPU_fp32) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare1<float>(&scope, ctx, "copy_cross_scope");
+}
+TEST(copy_cross_scope_to_main_scope, NPU_fp32) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare2<float>(&scope, ctx, "copy_cross_scope");
+}
+#endif
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -2254,7 +2254,8 @@ class Operator(object):
        'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id',
        'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream',
        'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv',
-        'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl'
+        'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl',
+        'copy_cross_scope'
    }
    def __init__(self,